In [1]:
import sys
import os
import time
import numpy as np
import pandas as pd
import pickle
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import f_classif
from itertools import combinations
from cuml.ensemble import RandomForestClassifier as cuRandomForestClassifier

import cupy as cp
import cudf
from cuml.linear_model import LogisticRegression as cuMLLogisticRegression


print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))
%load_ext autoreload
%autoreload 2

from src.common.utils import load_config_file
from src.embeddings.embeddings_utils import load_embeddings
from src.analysis.analyzer_multiplex_markers import AnalyzerMultiplexMarkers
from utils import *
NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
In [ ]:
 
In [18]:
## Baseline

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

for test_batches in batches:
    test_batches = [test_batches]
    train_batches = list(set(batches)-set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
    count_labels(y_test)

    # Encode with same label encoder
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)

    # Optional: balance training set
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Convert to GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_enc = cudf.Series(y_train_enc)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_enc)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_enc, y_pred, target_names=le.classes_, output_dict=True)
    print(classification_report(y_test_enc, y_pred, target_names=le.classes_))    
    plot_confusion_matrix(y_test_enc, y_pred, le)
    accuracy = report['accuracy']
    accuracies.append(accuracy)
    # Accumulate confusion matrix
    cm = confusion_matrix(y_test_enc, y_pred, labels=np.arange(len(le.classes_)))
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm
print(np.mean(accuracies), accuracies)
display_labels = [label.replace('_Untreated', '') for label in le.classes_]
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=display_labels)
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(41469,) (41469, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.66      0.91      0.76      1222
  FUSHomozygous_Untreated       0.87      0.53      0.66      1245
   FUSRevertant_Untreated       0.81      0.91      0.86      1015
           OPTN_Untreated       0.79      0.51      0.62      2314
           TBK1_Untreated       0.12      0.00      0.00      1876
          TDP43_Untreated       0.28      0.31      0.29      1699
             WT_Untreated       0.28      0.71      0.40      1561

                 accuracy                           0.50     10932
                macro avg       0.54      0.55      0.51     10932
             weighted avg       0.52      0.50      0.47     10932

Train dataset
batches [1, 3, 7, 8, 9, 10]
(44045,) (44045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.63      0.64      0.63      1231
  FUSHomozygous_Untreated       0.61      0.58      0.60      1162
   FUSRevertant_Untreated       0.47      0.99      0.64       800
           OPTN_Untreated       0.57      0.65      0.61      1649
           TBK1_Untreated       0.64      0.61      0.63      1220
          TDP43_Untreated       0.39      0.15      0.22      1508
             WT_Untreated       0.61      0.55      0.58       786

                 accuracy                           0.57      8356
                macro avg       0.56      0.60      0.56      8356
             weighted avg       0.56      0.57      0.54      8356

Train dataset
batches [1, 2, 7, 8, 9, 10]
(45470,) (45470, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.74      0.97      0.84      1004
  FUSHomozygous_Untreated       0.94      0.57      0.71       800
   FUSRevertant_Untreated       0.70      0.73      0.71      1131
           OPTN_Untreated       0.48      0.40      0.43      1103
           TBK1_Untreated       0.42      0.82      0.56      1045
          TDP43_Untreated       0.49      0.38      0.43       930
             WT_Untreated       0.57      0.17      0.26       918

                 accuracy                           0.59      6931
                macro avg       0.62      0.58      0.56      6931
             weighted avg       0.61      0.59      0.57      6931

Train dataset
batches [1, 2, 3, 8, 9, 10]
(52238,) (52238, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Test dataset
batches [7]
(163,) (163, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
FUSRevertant_Untreated: 44
OPTN_Untreated: 5
TBK1_Untreated: 13
TDP43_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.00      0.00      0.00        26
  FUSHomozygous_Untreated       0.00      0.00      0.00        25
   FUSRevertant_Untreated       0.00      0.00      0.00        44
           OPTN_Untreated       0.07      0.80      0.12         5
           TBK1_Untreated       0.50      0.08      0.13        13
          TDP43_Untreated       0.07      0.23      0.11        13
             WT_Untreated       0.61      0.97      0.75        37

                 accuracy                           0.27       163
                macro avg       0.18      0.30      0.16       163
             weighted avg       0.19      0.27      0.19       163

Train dataset
batches [1, 2, 3, 7, 9, 10]
(42851,) (42851, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.81      0.19      0.31      1567
  FUSHomozygous_Untreated       0.54      0.95      0.69      1562
   FUSRevertant_Untreated       0.65      0.44      0.53      1163
           OPTN_Untreated       0.43      0.24      0.31      1429
           TBK1_Untreated       0.73      0.14      0.24       755
          TDP43_Untreated       0.29      0.64      0.40      1564
             WT_Untreated       0.43      0.36      0.39      1510

                 accuracy                           0.45      9550
                macro avg       0.55      0.42      0.41      9550
             weighted avg       0.54      0.45      0.42      9550

Train dataset
batches [1, 2, 3, 7, 8, 10]
(43208,) (43208, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.19      0.27      0.22       963
  FUSHomozygous_Untreated       0.28      0.54      0.37       619
   FUSRevertant_Untreated       0.80      0.11      0.19      1298
           OPTN_Untreated       0.18      0.43      0.26      1586
           TBK1_Untreated       0.34      0.24      0.28       984
          TDP43_Untreated       0.10      0.09      0.09      1439
             WT_Untreated       0.94      0.26      0.41      2304

                 accuracy                           0.26      9193
                macro avg       0.40      0.28      0.26      9193
             weighted avg       0.47      0.26      0.27      9193

Train dataset
batches [1, 2, 3, 7, 8, 9]
(45125,) (45125, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.58      0.21      0.31       267
  FUSHomozygous_Untreated       0.75      0.94      0.84       666
   FUSRevertant_Untreated       0.05      0.87      0.09        45
           OPTN_Untreated       0.96      0.03      0.05      1763
           TBK1_Untreated       0.28      0.45      0.34       188
          TDP43_Untreated       0.09      0.01      0.01      2151
             WT_Untreated       0.43      0.98      0.60      2196

                 accuracy                           0.42      7276
                macro avg       0.45      0.50      0.32      7276
             weighted avg       0.49      0.42      0.29      7276

0.43597069158017904 [0.5032016099524332, 0.5671373863092388, 0.5864954552012697, 0.26993865030674846, 0.4500523560209424, 0.2592189709561623, 0.4157504123144585]
In [5]:
## L2 norm => doesnt improve

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

for test_batches in batches:
    test_batches = [test_batches]
    train_batches = list(set(batches)-set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)
    
    # L2 normalize each sample
    X_train /= np.linalg.norm(X_train, axis=1, keepdims=True)
    X_test /= np.linalg.norm(X_test, axis=1, keepdims=True)

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
    count_labels(y_test)

    # Encode with same label encoder
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)

    # Optional: balance training set
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Convert to GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_enc = cudf.Series(y_train_enc)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_enc)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_enc, y_pred, target_names=le.classes_, output_dict=True)
    print(classification_report(y_test_enc, y_pred, target_names=le.classes_))    
    plot_confusion_matrix(y_test_enc, y_pred, le)
    accuracy = report['accuracy']
    accuracies.append(accuracy)
    # Accumulate confusion matrix
    cm = confusion_matrix(y_test_enc, y_pred, labels=np.arange(len(le.classes_)))
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm
print(np.mean(accuracies), accuracies)
display_labels = [label.replace('_Untreated', '') for label in le.classes_]
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=display_labels)
disp.plot(xticks_rotation=45)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(41469,) (41469, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.60      0.93      0.73      1222
  FUSHomozygous_Untreated       0.86      0.39      0.54      1245
   FUSRevertant_Untreated       0.77      0.88      0.82      1015
           OPTN_Untreated       0.71      0.48      0.57      2314
           TBK1_Untreated       0.11      0.00      0.01      1876
          TDP43_Untreated       0.28      0.24      0.26      1699
             WT_Untreated       0.22      0.60      0.32      1561

                 accuracy                           0.46     10932
                macro avg       0.51      0.50      0.46     10932
             weighted avg       0.48      0.46      0.43     10932

Train dataset
batches [1, 3, 7, 8, 9, 10]
(44045,) (44045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.61      0.59      0.60      1231
  FUSHomozygous_Untreated       0.60      0.57      0.58      1162
   FUSRevertant_Untreated       0.44      0.98      0.60       800
           OPTN_Untreated       0.54      0.68      0.60      1649
           TBK1_Untreated       0.59      0.56      0.57      1220
          TDP43_Untreated       0.39      0.11      0.17      1508
             WT_Untreated       0.62      0.47      0.54       786

                 accuracy                           0.54      8356
                macro avg       0.54      0.57      0.52      8356
             weighted avg       0.53      0.54      0.51      8356

Train dataset
batches [1, 2, 7, 8, 9, 10]
(45470,) (45470, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.73      0.95      0.82      1004
  FUSHomozygous_Untreated       0.91      0.56      0.69       800
   FUSRevertant_Untreated       0.69      0.68      0.68      1131
           OPTN_Untreated       0.41      0.41      0.41      1103
           TBK1_Untreated       0.39      0.80      0.53      1045
          TDP43_Untreated       0.49      0.34      0.40       930
             WT_Untreated       0.41      0.07      0.12       918

                 accuracy                           0.55      6931
                macro avg       0.58      0.54      0.52      6931
             weighted avg       0.57      0.55      0.52      6931

Train dataset
batches [1, 2, 3, 8, 9, 10]
(52238,) (52238, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Test dataset
batches [7]
(163,) (163, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
FUSRevertant_Untreated: 44
OPTN_Untreated: 5
TBK1_Untreated: 13
TDP43_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.00      0.00      0.00        26
  FUSHomozygous_Untreated       0.00      0.00      0.00        25
   FUSRevertant_Untreated       0.00      0.00      0.00        44
           OPTN_Untreated       0.06      1.00      0.11         5
           TBK1_Untreated       1.00      0.08      0.14        13
          TDP43_Untreated       0.05      0.15      0.08        13
             WT_Untreated       0.76      0.86      0.81        37

                 accuracy                           0.25       163
                macro avg       0.27      0.30      0.16       163
             weighted avg       0.26      0.25      0.21       163

Train dataset
batches [1, 2, 3, 7, 9, 10]
(42851,) (42851, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.81      0.13      0.22      1567
  FUSHomozygous_Untreated       0.53      0.95      0.68      1562
   FUSRevertant_Untreated       0.60      0.21      0.31      1163
           OPTN_Untreated       0.45      0.24      0.32      1429
           TBK1_Untreated       0.72      0.12      0.21       755
          TDP43_Untreated       0.26      0.65      0.37      1564
             WT_Untreated       0.38      0.32      0.35      1510

                 accuracy                           0.41      9550
                macro avg       0.54      0.37      0.35      9550
             weighted avg       0.52      0.41      0.36      9550

Train dataset
batches [1, 2, 3, 7, 8, 10]
(43208,) (43208, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.13      0.16      0.14       963
  FUSHomozygous_Untreated       0.29      0.63      0.40       619
   FUSRevertant_Untreated       0.59      0.03      0.05      1298
           OPTN_Untreated       0.18      0.50      0.26      1586
           TBK1_Untreated       0.30      0.14      0.19       984
          TDP43_Untreated       0.08      0.06      0.07      1439
             WT_Untreated       0.88      0.20      0.32      2304

                 accuracy                           0.22      9193
                macro avg       0.35      0.24      0.21      9193
             weighted avg       0.41      0.22      0.21      9193

Train dataset
batches [1, 2, 3, 7, 8, 9]
(45125,) (45125, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
fit
predict
                           precision    recall  f1-score   support

FUSHeterozygous_Untreated       0.60      0.34      0.43       267
  FUSHomozygous_Untreated       0.78      0.91      0.84       666
   FUSRevertant_Untreated       0.05      0.96      0.10        45
           OPTN_Untreated       0.96      0.04      0.07      1763
           TBK1_Untreated       0.30      0.33      0.31       188
          TDP43_Untreated       0.06      0.00      0.01      2151
             WT_Untreated       0.42      0.98      0.59      2196

                 accuracy                           0.42      7276
                macro avg       0.45      0.51      0.33      7276
             weighted avg       0.48      0.42      0.30      7276

0.4052199180119422 [0.4556348335162825, 0.5391335567257061, 0.55201269658058, 0.24539877300613497, 0.40523560209424087, 0.22364842815185468, 0.41547553600879605]
In [13]:
all_probs = []
batches = [1,2,3,8,9,10]

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
    count_labels(y_test)

    # Encode
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)
    class_names = le.classes_

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_enc = ros.fit_resample(X_train, y_train_enc)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_enc = cudf.Series(y_train_enc)

    probs_per_batch = {}

    for class_index, class_name in enumerate(class_names):
        y_binary = (y_train_enc == class_index).astype(int)
        y_binary = cudf.Series(y_binary)

        clf = cuMLLogisticRegression(verbose=0)
        clf.fit(X_train, y_binary)

        probas = clf.predict_proba(X_test).to_numpy()[:, 1]
        probs_per_batch[class_name] = probas

    df_probs = pd.DataFrame(probs_per_batch)
    df_probs["true_label"] = [class_names[i] for i in y_test_enc]
    df_probs["test_batch"] = test_batch

    all_probs.append(df_probs)

results_df = pd.concat(all_probs, ignore_index=True)
Train dataset
batches [2, 3, 8, 9, 10]
(41306,) (41306, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5032
FUSHomozygous_Untreated: 4809
FUSRevertant_Untreated: 4437
OPTN_Untreated: 7530
TBK1_Untreated: 4192
TDP43_Untreated: 7592
WT_Untreated: 7714
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
Train dataset
batches [1, 3, 8, 9, 10]
(43882,) (43882, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5023
FUSHomozygous_Untreated: 4892
FUSRevertant_Untreated: 4652
OPTN_Untreated: 8195
TBK1_Untreated: 4848
TDP43_Untreated: 7783
WT_Untreated: 8489
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
Train dataset
batches [1, 2, 8, 9, 10]
(45307,) (45307, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5250
FUSHomozygous_Untreated: 5254
FUSRevertant_Untreated: 4321
OPTN_Untreated: 8741
TBK1_Untreated: 5023
TDP43_Untreated: 8361
WT_Untreated: 8357
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
Train dataset
batches [1, 2, 3, 9, 10]
(42688,) (42688, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4687
FUSHomozygous_Untreated: 4492
FUSRevertant_Untreated: 4289
OPTN_Untreated: 8415
TBK1_Untreated: 5313
TDP43_Untreated: 7727
WT_Untreated: 7765
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
Train dataset
batches [1, 2, 3, 8, 10]
(43045,) (43045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5291
FUSHomozygous_Untreated: 5435
FUSRevertant_Untreated: 4154
OPTN_Untreated: 8258
TBK1_Untreated: 5084
TDP43_Untreated: 7852
WT_Untreated: 6971
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
Train dataset
batches [1, 2, 3, 8, 9]
(44962,) (44962, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5987
FUSHomozygous_Untreated: 5388
FUSRevertant_Untreated: 5407
OPTN_Untreated: 8081
TBK1_Untreated: 5880
TDP43_Untreated: 7140
WT_Untreated: 7079
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
In [15]:
# Get the label with max probability per row
results_df["pred_label"] = results_df[class_names].idxmax(axis=1)
In [20]:
cm = confusion_matrix(results_df["true_label"], results_df["pred_label"], labels=class_names)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[l.replace('_Untreated', '') for l in class_names])
disp.plot(xticks_rotation=90)
plt.title("Confusion Matrix from One-vs-Rest Predictions")
plt.tight_layout()
plt.show()
In [29]:
all_probs = []
batches = [1,2,3,8,9,10]

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train), np.shape(X_train), np.unique(y_train))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test), np.shape(X_test), np.unique(y_test))
    count_labels(y_test)

    # Encode
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)
    class_names = le.classes_

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    probs_per_batch = {}
    X_test = cudf.DataFrame.from_records(X_test)

    for class_index, class_name in enumerate(class_names):
        y_binary = (y_train_enc == class_index).astype(int)
        
        # Optional: balance
        balance = True
        if balance:
            ros = RandomOverSampler(random_state=42)
            X_traini, y_binary = ros.fit_resample(X_train, y_binary)
            
        # To GPU
        X_traini = cudf.DataFrame.from_records(X_traini)
        y_binary = cudf.Series(y_binary)

        clf = cuMLLogisticRegression(verbose=0)
        clf.fit(X_traini, y_binary)

        probas = clf.predict_proba(X_test).to_numpy()[:, 1]
        probs_per_batch[class_name] = probas

    df_probs = pd.DataFrame(probs_per_batch)
    df_probs["true_label"] = [class_names[i] for i in y_test_enc]
    df_probs["test_batch"] = test_batch

    all_probs.append(df_probs)

results_df_bal = pd.concat(all_probs, ignore_index=True)
Train dataset
batches [2, 3, 8, 9, 10]
(41306,) (41306, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5032
FUSHomozygous_Untreated: 4809
FUSRevertant_Untreated: 4437
OPTN_Untreated: 7530
TBK1_Untreated: 4192
TDP43_Untreated: 7592
WT_Untreated: 7714
Test dataset
batches [1]
(10932,) (10932, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
FUSRevertant_Untreated: 1015
OPTN_Untreated: 2314
TBK1_Untreated: 1876
TDP43_Untreated: 1699
WT_Untreated: 1561
Train dataset
batches [1, 3, 8, 9, 10]
(43882,) (43882, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5023
FUSHomozygous_Untreated: 4892
FUSRevertant_Untreated: 4652
OPTN_Untreated: 8195
TBK1_Untreated: 4848
TDP43_Untreated: 7783
WT_Untreated: 8489
Test dataset
batches [2]
(8356,) (8356, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
FUSRevertant_Untreated: 800
OPTN_Untreated: 1649
TBK1_Untreated: 1220
TDP43_Untreated: 1508
WT_Untreated: 786
Train dataset
batches [1, 2, 8, 9, 10]
(45307,) (45307, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5250
FUSHomozygous_Untreated: 5254
FUSRevertant_Untreated: 4321
OPTN_Untreated: 8741
TBK1_Untreated: 5023
TDP43_Untreated: 8361
WT_Untreated: 8357
Test dataset
batches [3]
(6931,) (6931, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
FUSRevertant_Untreated: 1131
OPTN_Untreated: 1103
TBK1_Untreated: 1045
TDP43_Untreated: 930
WT_Untreated: 918
Train dataset
batches [1, 2, 3, 9, 10]
(42688,) (42688, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 4687
FUSHomozygous_Untreated: 4492
FUSRevertant_Untreated: 4289
OPTN_Untreated: 8415
TBK1_Untreated: 5313
TDP43_Untreated: 7727
WT_Untreated: 7765
Test dataset
batches [8]
(9550,) (9550, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
FUSRevertant_Untreated: 1163
OPTN_Untreated: 1429
TBK1_Untreated: 755
TDP43_Untreated: 1564
WT_Untreated: 1510
Train dataset
batches [1, 2, 3, 8, 10]
(43045,) (43045, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5291
FUSHomozygous_Untreated: 5435
FUSRevertant_Untreated: 4154
OPTN_Untreated: 8258
TBK1_Untreated: 5084
TDP43_Untreated: 7852
WT_Untreated: 6971
Test dataset
batches [9]
(9193,) (9193, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
FUSRevertant_Untreated: 1298
OPTN_Untreated: 1586
TBK1_Untreated: 984
TDP43_Untreated: 1439
WT_Untreated: 2304
Train dataset
batches [1, 2, 3, 8, 9]
(44962,) (44962, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 5987
FUSHomozygous_Untreated: 5388
FUSRevertant_Untreated: 5407
OPTN_Untreated: 8081
TBK1_Untreated: 5880
TDP43_Untreated: 7140
WT_Untreated: 7079
Test dataset
batches [10]
(7276,) (7276, 5568) ['FUSHeterozygous_Untreated' 'FUSHomozygous_Untreated'
 'FUSRevertant_Untreated' 'OPTN_Untreated' 'TBK1_Untreated'
 'TDP43_Untreated' 'WT_Untreated']
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
FUSRevertant_Untreated: 45
OPTN_Untreated: 1763
TBK1_Untreated: 188
TDP43_Untreated: 2151
WT_Untreated: 2196
In [30]:
# Get the label with max probability per row
results_df_bal["pred_label"] = results_df_bal[class_names].idxmax(axis=1)
In [31]:
cm = confusion_matrix(results_df_bal["true_label"], results_df_bal["pred_label"], labels=class_names)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[l.replace('_Untreated', '') for l in class_names])
disp.plot(xticks_rotation=90)
plt.title("Confusion Matrix from One-vs-Rest Predictions")
plt.tight_layout()
plt.show()
In [ ]:
 
In [33]:
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.38      0.96      0.55      1561
           1       0.97      0.42      0.59      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.70      8218
   macro avg       0.78      0.80      0.71      8218
weighted avg       0.87      0.70      0.71      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.88      0.60      0.71       786
           1       0.89      0.98      0.93      2869
           2       1.00      0.99      1.00      2393

    accuracy                           0.93      6048
   macro avg       0.92      0.86      0.88      6048
weighted avg       0.93      0.93      0.93      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.88      0.20      0.33       918
           1       0.74      0.99      0.85      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.84      4870
   macro avg       0.87      0.73      0.72      4870
weighted avg       0.86      0.84      0.81      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.50      0.97      0.66        37
           1       0.53      1.00      0.69        18
           2       0.00      0.00      0.00        51

    accuracy                           0.51       106
   macro avg       0.34      0.66      0.45       106
weighted avg       0.26      0.51      0.35       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.76      0.95      0.85      1510
           1       0.96      0.80      0.87      2184
           2       1.00      1.00      1.00      3129

    accuracy                           0.92      6823
   macro avg       0.91      0.92      0.91      6823
weighted avg       0.93      0.92      0.92      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.99      0.28      0.43      2304
           1       0.45      0.52      0.48      2570
           2       0.56      1.00      0.72      1582

    accuracy                           0.55      6456
   macro avg       0.67      0.60      0.54      6456
weighted avg       0.67      0.55      0.52      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      2196
           1       0.99      0.32      0.49      1951
           2       1.00      1.00      1.00       933

    accuracy                           0.74      5080
   macro avg       0.87      0.77      0.75      5080
weighted avg       0.83      0.74      0.70      5080

0.7426534988606892 [0.6994402531029448, 0.93369708994709, 0.8429158110882957, 0.5094339622641509, 0.9233475010992233, 0.5509603469640645, 0.7387795275590551]
In [36]:
## Feature selection =>not as good

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    top_features = get_top_features(X_train, y_train_mapped, 100)
    X_train = X_train[:, top_features]
    X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.11      0.21      0.15      1561
           1       0.56      0.39      0.46      4190
           2       1.00      0.99      1.00      2467

    accuracy                           0.54      8218
   macro avg       0.56      0.53      0.53      8218
weighted avg       0.61      0.54      0.56      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.29      0.38      0.33       786
           1       0.80      0.74      0.77      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.79      6048
   macro avg       0.70      0.70      0.70      6048
weighted avg       0.81      0.79      0.80      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.26      0.30      0.28       918
           1       0.67      0.64      0.65      2148
           2       1.00      0.99      0.99      1804

    accuracy                           0.70      4870
   macro avg       0.64      0.64      0.64      4870
weighted avg       0.72      0.70      0.71      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        37
           1       0.18      1.00      0.31        18
           2       1.00      0.14      0.24        51

    accuracy                           0.24       106
   macro avg       0.39      0.38      0.18       106
weighted avg       0.51      0.24      0.17       106

/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.41      0.39      0.40      1510
           1       0.58      0.61      0.60      2184
           2       1.00      0.99      1.00      3129

    accuracy                           0.74      6823
   macro avg       0.66      0.66      0.66      6823
weighted avg       0.74      0.74      0.74      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      2304
           1       0.27      0.34      0.30      2570
           2       0.48      1.00      0.65      1582

    accuracy                           0.38      6456
   macro avg       0.58      0.45      0.32      6456
weighted avg       0.58      0.38      0.28      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.54      0.46      0.50      2196
           1       0.48      0.57      0.52      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.60      5080
   macro avg       0.67      0.67      0.67      5080
weighted avg       0.60      0.60      0.60      5080

0.5682045972728116 [0.5355317595522024, 0.7895171957671958, 0.7026694045174537, 0.2358490566037736, 0.7361864282573648, 0.37964684014869887, 0.5980314960629921]
In [37]:
## Feature selection + bal =>not as good

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = True
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    top_features = get_top_features(X_train, y_train_mapped, 100)
    X_train = X_train[:, top_features]
    X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.19      0.49      0.27      1561
           1       0.52      0.21      0.30      4190
           2       1.00      0.99      1.00      2467

    accuracy                           0.50      8218
   macro avg       0.57      0.56      0.52      8218
weighted avg       0.60      0.50      0.50      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.25      0.68      0.37       786
           1       0.82      0.45      0.58      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.69      6048
   macro avg       0.69      0.70      0.65      6048
weighted avg       0.82      0.69      0.71      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.26      0.57      0.36       918
           1       0.61      0.30      0.41      2148
           2       1.00      0.99      0.99      1804

    accuracy                           0.61      4870
   macro avg       0.62      0.62      0.59      4870
weighted avg       0.69      0.61      0.62      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        37
           1       0.18      1.00      0.31        18
           2       1.00      0.16      0.27        51

    accuracy                           0.25       106
   macro avg       0.39      0.39      0.19       106
weighted avg       0.51      0.25      0.18       106

/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.40      0.62      0.49      1510
           1       0.57      0.37      0.45      2184
           2       1.00      0.99      1.00      3129

    accuracy                           0.71      6823
   macro avg       0.66      0.66      0.64      6823
weighted avg       0.73      0.71      0.71      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.95      0.02      0.03      2304
           1       0.27      0.33      0.30      2570
           2       0.48      1.00      0.65      1582

    accuracy                           0.38      6456
   macro avg       0.57      0.45      0.33      6456
weighted avg       0.57      0.38      0.29      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.55      0.91      0.68      2196
           1       0.58      0.15      0.24      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.63      5080
   macro avg       0.71      0.68      0.64      5080
weighted avg       0.64      0.63      0.57      5080

0.538174871366586 [0.4969578972986128, 0.6899801587301587, 0.609034907597536, 0.24528301886792453, 0.7103913234647515, 0.3828996282527881, 0.6326771653543307]
In [38]:
## Feature selection +bal+norm =>not as good

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = True
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = True
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    top_features = get_top_features(X_train, y_train_mapped, 100)
    X_train = X_train[:, top_features]
    X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.19      0.49      0.27      1561
           1       0.53      0.22      0.31      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.50      8218
   macro avg       0.57      0.57      0.53      8218
weighted avg       0.61      0.50      0.51      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.26      0.59      0.36       786
           1       0.82      0.53      0.65      2869
           2       1.00      1.00      1.00      2393

    accuracy                           0.72      6048
   macro avg       0.69      0.71      0.67      6048
weighted avg       0.82      0.72      0.75      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.28      0.54      0.37       918
           1       0.67      0.40      0.50      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.65      4870
   macro avg       0.65      0.65      0.62      4870
weighted avg       0.72      0.65      0.66      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.03      0.05        37
           1       0.30      0.33      0.32        18
           2       0.48      0.80      0.60        51

    accuracy                           0.45       106
   macro avg       0.59      0.39      0.32       106
weighted avg       0.63      0.45      0.36       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.38      0.60      0.47      1510
           1       0.55      0.34      0.42      2184
           2       1.00      1.00      1.00      3129

    accuracy                           0.70      6823
   macro avg       0.64      0.64      0.63      6823
weighted avg       0.72      0.70      0.70      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.02      0.04      2304
           1       0.31      0.40      0.35      2570
           2       0.50      1.00      0.67      1582

    accuracy                           0.41      6456
   macro avg       0.61      0.47      0.35      6456
weighted avg       0.60      0.41      0.32      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.55      0.88      0.68      2196
           1       0.58      0.19      0.29      1951
           2       1.00      1.00      1.00       933

    accuracy                           0.64      5080
   macro avg       0.71      0.69      0.65      5080
weighted avg       0.64      0.64      0.59      5080

0.5819310102465074 [0.5046239961061085, 0.7230489417989417, 0.6486652977412731, 0.4528301886792453, 0.6980800234500952, 0.40985130111524165, 0.6364173228346457]
In [39]:
## No feature selection +bal =>same

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = True
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
#     top_features = get_top_features(X_train, y_train_mapped, 100)
#     X_train = X_train[:, top_features]
#     X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.37      0.97      0.54      1561
           1       0.98      0.39      0.56      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.68      8218
   macro avg       0.78      0.79      0.70      8218
weighted avg       0.87      0.68      0.69      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.86      0.65      0.74       786
           1       0.90      0.97      0.94      2869
           2       1.00      0.99      1.00      2393

    accuracy                           0.94      6048
   macro avg       0.92      0.87      0.89      6048
weighted avg       0.94      0.94      0.93      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.86      0.24      0.38       918
           1       0.75      0.98      0.85      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.85      4870
   macro avg       0.87      0.74      0.74      4870
weighted avg       0.86      0.85      0.82      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.51      1.00      0.67        37
           1       0.55      1.00      0.71        18
           2       0.00      0.00      0.00        51

    accuracy                           0.52       106
   macro avg       0.35      0.67      0.46       106
weighted avg       0.27      0.52      0.35       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.74      0.96      0.83      1510
           1       0.97      0.76      0.85      2184
           2       1.00      1.00      1.00      3129

    accuracy                           0.92      6823
   macro avg       0.90      0.91      0.90      6823
weighted avg       0.93      0.92      0.92      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.99      0.32      0.49      2304
           1       0.45      0.49      0.47      2570
           2       0.55      1.00      0.71      1582

    accuracy                           0.56      6456
   macro avg       0.66      0.60      0.55      6456
weighted avg       0.67      0.56      0.53      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.61      1.00      0.76      2196
           1       0.99      0.27      0.43      1951
           2       1.00      1.00      1.00       933

    accuracy                           0.72      5080
   macro avg       0.87      0.76      0.73      5080
weighted avg       0.83      0.72      0.67      5080

0.7398777954817167 [0.6832562667315648, 0.9375, 0.848870636550308, 0.5188679245283019, 0.9152865308515316, 0.5560718711276332, 0.7192913385826771]
In [44]:
## add regularization c=0.1 - not improving

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1, c=0.1)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
[I] [16:58:04.002392] Unused keyword parameter: c during cuML estimator initialization
predict
              precision    recall  f1-score   support

           0       0.38      0.96      0.55      1561
           1       0.97      0.42      0.59      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.70      8218
   macro avg       0.78      0.80      0.71      8218
weighted avg       0.87      0.70      0.71      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.88      0.60      0.71       786
           1       0.89      0.98      0.93      2869
           2       1.00      0.99      1.00      2393

    accuracy                           0.93      6048
   macro avg       0.92      0.86      0.88      6048
weighted avg       0.93      0.93      0.93      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.87      0.20      0.32       918
           1       0.74      0.99      0.85      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.84      4870
   macro avg       0.87      0.73      0.72      4870
weighted avg       0.86      0.84      0.80      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.50      0.97      0.66        37
           1       0.53      1.00      0.69        18
           2       0.00      0.00      0.00        51

    accuracy                           0.51       106
   macro avg       0.34      0.66      0.45       106
weighted avg       0.26      0.51      0.35       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.76      0.95      0.85      1510
           1       0.96      0.80      0.87      2184
           2       1.00      1.00      1.00      3129

    accuracy                           0.92      6823
   macro avg       0.91      0.91      0.90      6823
weighted avg       0.93      0.92      0.92      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.99      0.28      0.43      2304
           1       0.45      0.52      0.48      2570
           2       0.56      1.00      0.72      1582

    accuracy                           0.55      6456
   macro avg       0.67      0.60      0.54      6456
weighted avg       0.67      0.55      0.52      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      2196
           1       0.99      0.32      0.49      1951
           2       1.00      1.00      1.00       933

    accuracy                           0.74      5080
   macro avg       0.87      0.77      0.75      5080
weighted avg       0.84      0.74      0.70      5080

0.7425284397307202 [0.6993185689948893, 0.9333664021164021, 0.8425051334702258, 0.5094339622641509, 0.923054374908398, 0.5506505576208178, 0.7393700787401575]
In [45]:
## add regularization c=10

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = cuMLLogisticRegression(verbose=1, c=10)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.38      0.96      0.55      1561
           1       0.97      0.42      0.59      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.70      8218
   macro avg       0.78      0.80      0.71      8218
weighted avg       0.87      0.70      0.70      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.88      0.60      0.71       786
           1       0.89      0.98      0.93      2869
           2       1.00      0.99      1.00      2393

    accuracy                           0.93      6048
   macro avg       0.92      0.86      0.88      6048
weighted avg       0.93      0.93      0.93      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.87      0.20      0.33       918
           1       0.74      0.99      0.85      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.84      4870
   macro avg       0.87      0.73      0.72      4870
weighted avg       0.86      0.84      0.80      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.50      0.97      0.66        37
           1       0.53      1.00      0.69        18
           2       0.00      0.00      0.00        51

    accuracy                           0.51       106
   macro avg       0.34      0.66      0.45       106
weighted avg       0.26      0.51      0.35       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.76      0.95      0.85      1510
           1       0.96      0.80      0.87      2184
           2       1.00      1.00      1.00      3129

    accuracy                           0.92      6823
   macro avg       0.91      0.91      0.90      6823
weighted avg       0.93      0.92      0.92      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.99      0.28      0.43      2304
           1       0.45      0.52      0.48      2570
           2       0.56      1.00      0.72      1582

    accuracy                           0.55      6456
   macro avg       0.66      0.60      0.54      6456
weighted avg       0.67      0.55      0.52      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      2196
           1       0.99      0.32      0.49      1951
           2       1.00      1.00      1.00       933

    accuracy                           0.74      5080
   macro avg       0.87      0.77      0.75      5080
weighted avg       0.83      0.74      0.70      5080

0.7422607518311682 [0.6983450961304454, 0.9340277777777778, 0.8427104722792608, 0.5094339622641509, 0.9232009380038106, 0.5497211895910781, 0.7383858267716535]
In [42]:
## RandomForestClassifier

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = True
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=100, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.44      0.32      0.37      1561
           1       0.77      0.85      0.81      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.79      8218
   macro avg       0.74      0.72      0.73      8218
weighted avg       0.78      0.79      0.78      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.88      0.16      0.28       786
           1       0.80      0.99      0.89      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.88      6048
   macro avg       0.89      0.71      0.72      6048
weighted avg       0.89      0.88      0.85      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.68      0.05      0.09       918
           1       0.71      0.99      0.83      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.82      4870
   macro avg       0.79      0.68      0.64      4870
weighted avg       0.81      0.82      0.75      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.75      0.16      0.27        37
           1       0.23      1.00      0.37        18
           2       0.58      0.22      0.31        51

    accuracy                           0.33       106
   macro avg       0.52      0.46      0.32       106
weighted avg       0.58      0.33      0.31       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.72      0.60      0.66      1510
           1       0.74      0.84      0.79      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.86      6823
   macro avg       0.82      0.81      0.81      6823
weighted avg       0.86      0.86      0.85      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       0.99      0.07      0.14      2304
           1       0.25      0.28      0.27      2570
           2       0.46      1.00      0.63      1582

    accuracy                           0.38      6456
   macro avg       0.57      0.45      0.34      6456
weighted avg       0.57      0.38      0.31      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.60      1.00      0.75      2196
           1       0.97      0.26      0.41      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.71      5080
   macro avg       0.86      0.75      0.72      5080
weighted avg       0.82      0.71      0.67      5080

0.6819123024071383 [0.7937454368459479, 0.8814484126984127, 0.8151950718685832, 0.330188679245283, 0.8563681664956764, 0.3830545229244114, 0.7133858267716535]
In [43]:
## RandomForestClassifier
batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    balance = False
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    norm = False
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=100, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.46      0.21      0.28      1561
           1       0.75      0.91      0.82      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.80      8218
   macro avg       0.74      0.70      0.70      8218
weighted avg       0.77      0.80      0.77      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.92      0.11      0.20       786
           1       0.79      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.88      6048
   macro avg       0.90      0.70      0.69      6048
weighted avg       0.89      0.88      0.84      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.65      0.02      0.04       918
           1       0.70      0.99      0.82      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.78      0.67      0.62      4870
weighted avg       0.80      0.81      0.74      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.80      0.11      0.19        37
           1       0.28      0.94      0.44        18
           2       0.54      0.43      0.48        51

    accuracy                           0.41       106
   macro avg       0.54      0.49      0.37       106
weighted avg       0.59      0.41      0.37       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.78      0.49      0.60      1510
           1       0.71      0.91      0.80      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.85      6823
   macro avg       0.83      0.79      0.80      6823
weighted avg       0.86      0.85      0.84      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.07      0.14      2304
           1       0.26      0.30      0.28      2570
           2       0.47      1.00      0.64      1582

    accuracy                           0.39      6456
   macro avg       0.58      0.46      0.35      6456
weighted avg       0.58      0.39      0.32      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.63      0.99      0.77      2196
           1       0.96      0.35      0.51      1951
           2       1.00      0.99      0.99       933

    accuracy                           0.74      5080
   macro avg       0.86      0.78      0.76      5080
weighted avg       0.83      0.74      0.71      5080

0.6969770713683098 [0.8015332197614992, 0.875165343915344, 0.8123203285420945, 0.4056603773584906, 0.8505056426791734, 0.3895600991325898, 0.7440944881889764]
In [46]:
## RandomForestClassifier
balance = False
norm = False
choose_features=True

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=100, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.08      0.15      0.11      1561
           1       0.54      0.38      0.45      4190
           2       1.00      0.99      1.00      2467

    accuracy                           0.52      8218
   macro avg       0.54      0.51      0.52      8218
weighted avg       0.59      0.52      0.55      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.26      0.36      0.30       786
           1       0.79      0.72      0.75      2869
           2       1.00      0.99      0.99      2393

    accuracy                           0.78      6048
   macro avg       0.68      0.69      0.68      6048
weighted avg       0.81      0.78      0.79      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.26      0.30      0.28       918
           1       0.67      0.63      0.65      2148
           2       1.00      0.99      1.00      1804

    accuracy                           0.70      4870
   macro avg       0.64      0.64      0.64      4870
weighted avg       0.72      0.70      0.71      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        37
           1       0.20      0.61      0.30        18
           2       0.56      0.55      0.55        51

    accuracy                           0.37       106
   macro avg       0.25      0.39      0.28       106
weighted avg       0.30      0.37      0.32       106

/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.37      0.36      0.36      1510
           1       0.56      0.57      0.56      2184
           2       1.00      0.99      1.00      3129

    accuracy                           0.72      6823
   macro avg       0.64      0.64      0.64      6823
weighted avg       0.72      0.72      0.72      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      2304
           1       0.25      0.29      0.27      2570
           2       0.47      1.00      0.64      1582

    accuracy                           0.36      6456
   macro avg       0.57      0.43      0.30      6456
weighted avg       0.57      0.36      0.26      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.51      0.45      0.48      2196
           1       0.45      0.51      0.48      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.57      5080
   macro avg       0.65      0.65      0.65      5080
weighted avg       0.58      0.57      0.57      5080

0.5743842090556159 [0.520929666585544, 0.7782738095238095, 0.7012320328542094, 0.36792452830188677, 0.7162538472812546, 0.362453531598513, 0.5736220472440945]
In [47]:
## RandomForestClassifier
balance = False
norm = False
choose_features=False

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=50, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.43      0.22      0.29      1561
           1       0.75      0.89      0.81      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.79      8218
   macro avg       0.73      0.70      0.70      8218
weighted avg       0.77      0.79      0.77      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.92      0.13      0.23       786
           1       0.80      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.88      6048
   macro avg       0.90      0.70      0.70      6048
weighted avg       0.89      0.88      0.84      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.70      0.04      0.08       918
           1       0.71      0.99      0.83      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.80      0.68      0.63      4870
weighted avg       0.81      0.81      0.75      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.33      0.03      0.05        37
           1       0.23      0.78      0.35        18
           2       0.41      0.33      0.37        51

    accuracy                           0.30       106
   macro avg       0.32      0.38      0.26       106
weighted avg       0.35      0.30      0.25       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.72      0.50      0.59      1510
           1       0.70      0.87      0.78      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.84      6823
   macro avg       0.81      0.78      0.79      6823
weighted avg       0.84      0.84      0.84      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.10      0.18      2304
           1       0.27      0.30      0.28      2570
           2       0.46      1.00      0.63      1582

    accuracy                           0.40      6456
   macro avg       0.58      0.46      0.36      6456
weighted avg       0.58      0.40      0.33      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.63      0.99      0.77      2196
           1       0.95      0.35      0.51      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.74      5080
   macro avg       0.86      0.78      0.76      5080
weighted avg       0.82      0.74      0.71      5080

0.6812646448819758 [0.7942321732781699, 0.8771494708994709, 0.8149897330595482, 0.3018867924528302, 0.8406859152865308, 0.39699504337050806, 0.7429133858267717]
In [48]:
## RandomForestClassifier n_estimators=50
balance = False
norm = False
choose_features=False

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=50, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.43      0.22      0.29      1561
           1       0.75      0.89      0.81      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.79      8218
   macro avg       0.73      0.70      0.70      8218
weighted avg       0.77      0.79      0.77      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.92      0.13      0.23       786
           1       0.80      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.88      6048
   macro avg       0.90      0.70      0.70      6048
weighted avg       0.89      0.88      0.84      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.70      0.04      0.08       918
           1       0.71      0.99      0.83      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.80      0.68      0.63      4870
weighted avg       0.81      0.81      0.75      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.33      0.03      0.05        37
           1       0.23      0.78      0.35        18
           2       0.41      0.33      0.37        51

    accuracy                           0.30       106
   macro avg       0.32      0.38      0.26       106
weighted avg       0.35      0.30      0.25       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.72      0.50      0.59      1510
           1       0.70      0.87      0.78      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.84      6823
   macro avg       0.81      0.78      0.79      6823
weighted avg       0.84      0.84      0.84      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.10      0.18      2304
           1       0.27      0.30      0.28      2570
           2       0.46      1.00      0.63      1582

    accuracy                           0.40      6456
   macro avg       0.58      0.46      0.36      6456
weighted avg       0.58      0.40      0.33      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.63      0.99      0.77      2196
           1       0.95      0.35      0.51      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.74      5080
   macro avg       0.86      0.78      0.76      5080
weighted avg       0.82      0.74      0.71      5080

0.6812646448819758 [0.7942321732781699, 0.8771494708994709, 0.8149897330595482, 0.3018867924528302, 0.8406859152865308, 0.39699504337050806, 0.7429133858267717]
In [49]:
## RandomForestClassifier n_estimators=200
balance = False
norm = False
choose_features=False

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=200, max_depth=20)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.46      0.19      0.27      1561
           1       0.75      0.92      0.83      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.80      8218
   macro avg       0.74      0.70      0.70      8218
weighted avg       0.77      0.80      0.77      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.95      0.10      0.18       786
           1       0.79      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.87      6048
   macro avg       0.91      0.69      0.69      6048
weighted avg       0.89      0.87      0.83      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.82      0.02      0.04       918
           1       0.70      1.00      0.83      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.84      0.67      0.62      4870
weighted avg       0.83      0.81      0.74      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.67      0.05      0.10        37
           1       0.26      0.89      0.40        18
           2       0.56      0.45      0.50        51

    accuracy                           0.39       106
   macro avg       0.50      0.46      0.33       106
weighted avg       0.55      0.39      0.34       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.82      0.48      0.60      1510
           1       0.71      0.93      0.80      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.86      6823
   macro avg       0.84      0.80      0.80      6823
weighted avg       0.87      0.86      0.85      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.06      0.12      2304
           1       0.26      0.29      0.28      2570
           2       0.47      1.00      0.63      1582

    accuracy                           0.38      6456
   macro avg       0.57      0.45      0.34      6456
weighted avg       0.57      0.38      0.31      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.63      1.00      0.77      2196
           1       0.98      0.33      0.50      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.74      5080
   macro avg       0.87      0.77      0.75      5080
weighted avg       0.83      0.74      0.71      5080

0.6941267651147333 [0.802750060842054, 0.874834656084656, 0.8135523613963039, 0.3867924528301887, 0.8556353510186135, 0.3839838909541512, 0.7413385826771653]
In [50]:
## RandomForestClassifier max_depth=10
balance = False
norm = False
choose_features=False

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=100, max_depth=10)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.44      0.14      0.22      1561
           1       0.74      0.93      0.83      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.80      8218
   macro avg       0.73      0.69      0.68      8218
weighted avg       0.76      0.80      0.76      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.04      0.07       786
           1       0.78      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.86      6048
   macro avg       0.93      0.67      0.64      6048
weighted avg       0.89      0.86      0.82      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.67      0.00      0.01       918
           1       0.70      1.00      0.82      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.79      0.67      0.61      4870
weighted avg       0.80      0.81      0.73      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.83      0.14      0.23        37
           1       0.25      0.94      0.40        18
           2       0.55      0.35      0.43        51

    accuracy                           0.38       106
   macro avg       0.54      0.48      0.35       106
weighted avg       0.60      0.38      0.36       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.80      0.44      0.57      1510
           1       0.70      0.93      0.80      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.85      6823
   macro avg       0.83      0.79      0.79      6823
weighted avg       0.86      0.85      0.84      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.05      0.10      2304
           1       0.26      0.30      0.28      2570
           2       0.47      1.00      0.64      1582

    accuracy                           0.38      6456
   macro avg       0.58      0.45      0.34      6456
weighted avg       0.58      0.38      0.30      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      2196
           1       0.96      0.37      0.53      1951
           2       1.00      0.99      1.00       933

    accuracy                           0.75      5080
   macro avg       0.87      0.78      0.77      5080
weighted avg       0.83      0.75      0.72      5080

0.6911670464646049 [0.8016549038695546, 0.8649140211640212, 0.8108829568788501, 0.37735849056603776, 0.8475743807709218, 0.38460346964064435, 0.7511811023622047]
In [51]:
## RandomForestClassifier max_depth=50
balance = False
norm = False
choose_features=False

batches = [1,2,3,7,8,9,10]
accuracies = []
accumulated_cm = None

# Label mapping
label_map = {
    'WT_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}
excluded_labels = {'FUSRevertant_Untreated', 'TDP43_Untreated'}

for test_batch in batches:
    test_batches = [test_batch]
    train_batches = list(set(batches) - set(test_batches))

    X_train, y_train = load_batches(train_batches)
    X_test, y_test = load_batches(test_batches)

    # Filter out unwanted labels
    train_mask = ~np.isin(y_train, list(excluded_labels))
    test_mask = ~np.isin(y_test, list(excluded_labels))
    X_train, y_train = X_train[train_mask], y_train[train_mask]
    X_test, y_test = X_test[test_mask], y_test[test_mask]

    # Map labels to 0,1,2
    y_train_mapped = np.array([label_map[l] for l in y_train])
    y_test_mapped = np.array([label_map[l] for l in y_test])

    print('Train dataset')
    print('batches', train_batches)
    print(np.shape(y_train_mapped), np.shape(X_train), np.unique(y_train_mapped))
    count_labels(y_train)
    print('Test dataset')
    print('batches', test_batches)
    print(np.shape(y_test_mapped), np.shape(X_test), np.unique(y_test_mapped))
    count_labels(y_test)

    # Optional: balance
    if balance:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train_mapped = ros.fit_resample(X_train, y_train_mapped)

    # Optional: normalize
    if norm:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
    if choose_features:
        top_features = get_top_features(X_train, y_train_mapped, 100)
        X_train = X_train[:, top_features]
        X_test = X_test[:, top_features]

    # To GPU
    X_train = cudf.DataFrame.from_records(X_train)
    X_test = cudf.DataFrame.from_records(X_test)
    y_train_mapped = cudf.Series(y_train_mapped)

    # Train
    print('fit')
    clf = RandomForestClassifier(n_estimators=100, max_depth=50)
    clf.fit(X_train, y_train_mapped)

    # Predict
    print('predict')
    y_pred = clf.predict(X_test).to_numpy()
    report = classification_report(y_test_mapped, y_pred, output_dict=True)
    print(classification_report(y_test_mapped, y_pred))    
    accuracy = report['accuracy']
    accuracies.append(accuracy)

    # Confusion matrix
    cm = confusion_matrix(y_test_mapped, y_pred, labels=[0, 1, 2])
    if accumulated_cm is None:
        accumulated_cm = cm
    else:
        accumulated_cm += cm

# Final summary
print(np.mean(accuracies), accuracies)
disp = ConfusionMatrixDisplay(confusion_matrix=accumulated_cm, display_labels=["WT", "TBK1/OPTN", "FUS"])
disp.plot(xticks_rotation=0)
plt.title("Combined Confusion Matrix Across Batches")
plt.tight_layout()
plt.show()
Train dataset
batches [2, 3, 7, 8, 9, 10]
(29383,) (29383, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
OPTN_Untreated: 7535
TBK1_Untreated: 4205
WT_Untreated: 7751
Test dataset
batches [1]
(8218,) (8218, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1222
FUSHomozygous_Untreated: 1245
OPTN_Untreated: 2314
TBK1_Untreated: 1876
WT_Untreated: 1561
fit
predict
              precision    recall  f1-score   support

           0       0.46      0.22      0.30      1561
           1       0.76      0.90      0.82      4190
           2       1.00      1.00      1.00      2467

    accuracy                           0.80      8218
   macro avg       0.74      0.71      0.71      8218
weighted avg       0.77      0.80      0.78      8218

Train dataset
batches [1, 3, 7, 8, 9, 10]
(31553,) (31553, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
OPTN_Untreated: 8200
TBK1_Untreated: 4861
WT_Untreated: 8526
Test dataset
batches [2]
(6048,) (6048, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1231
FUSHomozygous_Untreated: 1162
OPTN_Untreated: 1649
TBK1_Untreated: 1220
WT_Untreated: 786
fit
predict
              precision    recall  f1-score   support

           0       0.91      0.12      0.21       786
           1       0.79      1.00      0.88      2869
           2       1.00      0.98      0.99      2393

    accuracy                           0.88      6048
   macro avg       0.90      0.70      0.70      6048
weighted avg       0.89      0.88      0.84      6048

Train dataset
batches [1, 2, 7, 8, 9, 10]
(32731,) (32731, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
OPTN_Untreated: 8746
TBK1_Untreated: 5036
WT_Untreated: 8394
Test dataset
batches [3]
(4870,) (4870, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1004
FUSHomozygous_Untreated: 800
OPTN_Untreated: 1103
TBK1_Untreated: 1045
WT_Untreated: 918
fit
predict
              precision    recall  f1-score   support

           0       0.70      0.03      0.05       918
           1       0.70      0.99      0.82      2148
           2       1.00      1.00      1.00      1804

    accuracy                           0.81      4870
   macro avg       0.80      0.67      0.62      4870
weighted avg       0.81      0.81      0.74      4870

Train dataset
batches [1, 2, 3, 8, 9, 10]
(37495,) (37495, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
OPTN_Untreated: 9844
TBK1_Untreated: 6068
WT_Untreated: 9275
Test dataset
batches [7]
(106,) (106, 5568) [0 1 2]
FUSHeterozygous_Untreated: 26
FUSHomozygous_Untreated: 25
OPTN_Untreated: 5
TBK1_Untreated: 13
WT_Untreated: 37
fit
predict
              precision    recall  f1-score   support

           0       0.80      0.11      0.19        37
           1       0.25      0.89      0.40        18
           2       0.55      0.41      0.47        51

    accuracy                           0.39       106
   macro avg       0.54      0.47      0.35       106
weighted avg       0.59      0.39      0.36       106

Train dataset
batches [1, 2, 3, 7, 9, 10]
(30778,) (30778, 5568) [0 1 2]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
OPTN_Untreated: 8420
TBK1_Untreated: 5326
WT_Untreated: 7802
Test dataset
batches [8]
(6823,) (6823, 5568) [0 1 2]
FUSHeterozygous_Untreated: 1567
FUSHomozygous_Untreated: 1562
OPTN_Untreated: 1429
TBK1_Untreated: 755
WT_Untreated: 1510
fit
predict
              precision    recall  f1-score   support

           0       0.76      0.51      0.61      1510
           1       0.71      0.89      0.79      2184
           2       1.00      0.99      0.99      3129

    accuracy                           0.85      6823
   macro avg       0.83      0.79      0.80      6823
weighted avg       0.86      0.85      0.84      6823

Train dataset
batches [1, 2, 3, 7, 8, 10]
(31145,) (31145, 5568) [0 1 2]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
OPTN_Untreated: 8263
TBK1_Untreated: 5097
WT_Untreated: 7008
Test dataset
batches [9]
(6456,) (6456, 5568) [0 1 2]
FUSHeterozygous_Untreated: 963
FUSHomozygous_Untreated: 619
OPTN_Untreated: 1586
TBK1_Untreated: 984
WT_Untreated: 2304
fit
predict
              precision    recall  f1-score   support

           0       1.00      0.08      0.15      2304
           1       0.27      0.30      0.28      2570
           2       0.47      1.00      0.64      1582

    accuracy                           0.39      6456
   macro avg       0.58      0.46      0.36      6456
weighted avg       0.58      0.39      0.32      6456

Train dataset
batches [1, 2, 3, 7, 8, 9]
(32521,) (32521, 5568) [0 1 2]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
OPTN_Untreated: 8086
TBK1_Untreated: 5893
WT_Untreated: 7116
Test dataset
batches [10]
(5080,) (5080, 5568) [0 1 2]
FUSHeterozygous_Untreated: 267
FUSHomozygous_Untreated: 666
OPTN_Untreated: 1763
TBK1_Untreated: 188
WT_Untreated: 2196
fit
predict
              precision    recall  f1-score   support

           0       0.62      0.99      0.77      2196
           1       0.97      0.33      0.49      1951
           2       1.00      0.99      0.99       933

    accuracy                           0.74      5080
   macro avg       0.86      0.77      0.75      5080
weighted avg       0.82      0.74      0.70      5080

0.6940960873564158 [0.8012898515453881, 0.8763227513227513, 0.813141683778234, 0.3867924528301887, 0.8500659533929357, 0.3940520446096654, 0.7370078740157481]
In [70]:
 
In [92]:
## Baseline
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.66      0.92      0.77      1222
           1       0.87      0.53      0.66      1245
           2       0.81      0.91      0.86      1015
           3       0.79      0.51      0.62      2314
           4       0.12      0.00      0.00      1876
           5       0.28      0.31      0.29      1699
           6       0.28      0.71      0.40      1561

    accuracy                           0.50     10932
   macro avg       0.54      0.55      0.51     10932
weighted avg       0.52      0.50      0.47     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.63      0.64      0.63      1231
           1       0.61      0.58      0.60      1162
           2       0.47      0.99      0.64       800
           3       0.57      0.65      0.61      1649
           4       0.64      0.61      0.62      1220
           5       0.38      0.15      0.21      1508
           6       0.61      0.55      0.58       786

    accuracy                           0.57      8356
   macro avg       0.56      0.60      0.56      8356
weighted avg       0.56      0.57      0.54      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.74      0.97      0.84      1004
           1       0.94      0.57      0.71       800
           2       0.70      0.74      0.72      1131
           3       0.47      0.40      0.43      1103
           4       0.42      0.82      0.56      1045
           5       0.49      0.38      0.43       930
           6       0.57      0.17      0.26       918

    accuracy                           0.59      6931
   macro avg       0.62      0.58      0.56      6931
weighted avg       0.61      0.59      0.57      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.06      0.80      0.12         5
           4       0.50      0.08      0.13        13
           5       0.07      0.23      0.11        13
           6       0.61      0.97      0.75        37

    accuracy                           0.27       163
   macro avg       0.18      0.30      0.16       163
weighted avg       0.19      0.27      0.19       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.81      0.19      0.31      1567
           1       0.54      0.95      0.69      1562
           2       0.65      0.44      0.53      1163
           3       0.43      0.24      0.31      1429
           4       0.74      0.14      0.23       755
           5       0.29      0.64      0.40      1564
           6       0.43      0.36      0.39      1510

    accuracy                           0.45      9550
   macro avg       0.56      0.42      0.41      9550
weighted avg       0.54      0.45      0.42      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.19      0.27      0.22       963
           1       0.28      0.54      0.37       619
           2       0.78      0.11      0.19      1298
           3       0.18      0.43      0.26      1586
           4       0.34      0.23      0.28       984
           5       0.10      0.09      0.09      1439
           6       0.94      0.26      0.41      2304

    accuracy                           0.26      9193
   macro avg       0.40      0.28      0.26      9193
weighted avg       0.47      0.26      0.27      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.58      0.21      0.31       267
           1       0.75      0.94      0.84       666
           2       0.04      0.87      0.08        45
           3       0.96      0.03      0.05      1763
           4       0.28      0.45      0.35       188
           5       0.09      0.01      0.01      2151
           6       0.43      0.98      0.60      2196

    accuracy                           0.42      7276
   macro avg       0.45      0.50      0.32      7276
weighted avg       0.49      0.42      0.30      7276


=== Overall Accuracy ===
0.435781782307196 [0.5038419319429198, 0.5652225945428435, 0.5863511758764969, 0.26993865030674846, 0.4496335078534031, 0.25987164146633307, 0.4156129741616273]
In [74]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 100, "max_depth": 20},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.53      0.83      0.64      1222
           1       0.62      0.27      0.37      1245
           2       0.83      0.18      0.30      1015
           3       0.37      0.72      0.49      2314
           4       0.07      0.00      0.00      1876
           5       0.29      0.35      0.32      1699
           6       0.11      0.12      0.11      1561

    accuracy                           0.37     10932
   macro avg       0.40      0.35      0.32     10932
weighted avg       0.36      0.37      0.31     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.54      0.86      0.66      1231
           1       0.61      0.21      0.31      1162
           2       0.31      0.93      0.47       800
           3       0.43      0.56      0.49      1649
           4       0.48      0.15      0.23      1220
           5       0.30      0.10      0.15      1508
           6       0.47      0.33      0.39       786

    accuracy                           0.43      8356
   macro avg       0.45      0.45      0.39      8356
weighted avg       0.45      0.43      0.38      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.60      0.96      0.73      1004
           1       0.79      0.19      0.30       800
           2       0.73      0.18      0.29      1131
           3       0.19      0.15      0.17      1103
           4       0.38      0.72      0.49      1045
           5       0.30      0.40      0.34       930
           6       0.19      0.15      0.17       918

    accuracy                           0.40      6931
   macro avg       0.45      0.39      0.36      6931
weighted avg       0.45      0.40      0.36      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.15      0.23      0.18        26
           1       0.31      0.16      0.21        25
           2       0.00      0.00      0.00        44
           3       0.04      0.80      0.08         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.67      0.16      0.26        37

    accuracy                           0.12       163
   macro avg       0.17      0.19      0.10       163
weighted avg       0.22      0.12      0.12       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.91      0.07      0.14      1567
           1       0.52      0.98      0.68      1562
           2       0.17      0.08      0.11      1163
           3       0.24      0.21      0.22      1429
           4       0.32      0.01      0.02       755
           5       0.20      0.39      0.27      1564
           6       0.22      0.24      0.23      1510

    accuracy                           0.32      9550
   macro avg       0.37      0.28      0.24      9550
weighted avg       0.38      0.32      0.26      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.15      0.20      0.17       963
           1       0.21      0.86      0.34       619
           2       0.00      0.00      0.00      1298
           3       0.00      0.00      0.00      1586
           4       0.02      0.00      0.00       984
           5       0.04      0.04      0.04      1439
           6       0.69      0.13      0.22      2304

    accuracy                           0.12      9193
   macro avg       0.16      0.18      0.11      9193
weighted avg       0.21      0.12      0.10      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.23      0.11      0.15       267
           1       0.71      0.85      0.78       666
           2       0.04      0.67      0.08        45
           3       0.57      0.01      0.02      1763
           4       0.21      0.02      0.04       188
           5       0.12      0.01      0.02      2151
           6       0.39      0.96      0.55      2196

    accuracy                           0.38      7276
   macro avg       0.33      0.38      0.24      7276
weighted avg       0.37      0.38      0.26      7276


=== Overall Accuracy ===
0.30419986925510606 [0.36525795828759605, 0.4269985639061752, 0.3970567017746357, 0.12269938650306748, 0.3169633507853403, 0.11889481126944414, 0.38152831225948325]
In [75]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 100, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.53      0.83      0.64      1222
           1       0.62      0.27      0.37      1245
           2       0.83      0.18      0.30      1015
           3       0.37      0.72      0.49      2314
           4       0.07      0.00      0.00      1876
           5       0.29      0.35      0.32      1699
           6       0.11      0.12      0.11      1561

    accuracy                           0.37     10932
   macro avg       0.40      0.35      0.32     10932
weighted avg       0.36      0.37      0.31     10932


=== Overall Accuracy ===
0.36525795828759605 [0.36525795828759605]
In [76]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 50, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.52      0.76      0.62      1222
           1       0.57      0.30      0.40      1245
           2       0.68      0.20      0.31      1015
           3       0.38      0.70      0.49      2314
           4       0.08      0.00      0.01      1876
           5       0.28      0.33      0.30      1699
           6       0.14      0.16      0.15      1561

    accuracy                           0.36     10932
   macro avg       0.38      0.35      0.32     10932
weighted avg       0.34      0.36      0.32     10932


=== Overall Accuracy ===
0.35949506037321627 [0.35949506037321627]
In [77]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 200, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.54      0.86      0.66      1222
           1       0.66      0.26      0.38      1245
           2       0.90      0.19      0.32      1015
           3       0.38      0.75      0.51      2314
           4       0.10      0.00      0.00      1876
           5       0.30      0.37      0.33      1699
           6       0.09      0.10      0.10      1561

    accuracy                           0.37     10932
   macro avg       0.42      0.36      0.33     10932
weighted avg       0.38      0.37      0.32     10932


=== Overall Accuracy ===
0.3739480424442005 [0.3739480424442005]
In [78]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=True,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 50, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.53      0.79      0.63      1222
           1       0.60      0.30      0.40      1245
           2       0.58      0.32      0.42      1015
           3       0.37      0.67      0.48      2314
           4       0.16      0.01      0.03      1876
           5       0.30      0.33      0.31      1699
           6       0.13      0.13      0.13      1561

    accuracy                           0.37     10932
   macro avg       0.38      0.37      0.34     10932
weighted avg       0.35      0.37      0.33     10932


=== Overall Accuracy ===
0.36717892425905596 [0.36717892425905596]
In [79]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 30, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.51      0.71      0.60      1222
           1       0.55      0.33      0.41      1245
           2       0.49      0.20      0.28      1015
           3       0.36      0.66      0.47      2314
           4       0.11      0.01      0.01      1876
           5       0.27      0.29      0.28      1699
           6       0.15      0.18      0.16      1561

    accuracy                           0.35     10932
   macro avg       0.35      0.34      0.32     10932
weighted avg       0.33      0.35      0.31     10932


=== Overall Accuracy ===
0.34824368825466523 [0.34824368825466523]
In [80]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 20, "max_depth": 20},
    test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.51      0.70      0.59      1222
           1       0.54      0.35      0.42      1245
           2       0.43      0.22      0.29      1015
           3       0.36      0.60      0.45      2314
           4       0.15      0.02      0.03      1876
           5       0.27      0.29      0.28      1699
           6       0.16      0.22      0.19      1561

    accuracy                           0.34     10932
   macro avg       0.35      0.34      0.32     10932
weighted avg       0.33      0.34      0.31     10932


=== Overall Accuracy ===
0.34357848518111966 [0.34357848518111966]
In [91]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuRandomForestClassifier,
    classifier_kwargs={"n_estimators": 20, "max_depth": 15},
#     test_specific_batch=[1]
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.52      0.70      0.59      1222
           1       0.55      0.35      0.43      1245
           2       0.50      0.19      0.28      1015
           3       0.35      0.61      0.45      2314
           4       0.13      0.01      0.02      1876
           5       0.28      0.30      0.29      1699
           6       0.14      0.19      0.16      1561

    accuracy                           0.34     10932
   macro avg       0.35      0.34      0.32     10932
weighted avg       0.33      0.34      0.31     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.53      0.73      0.62      1231
           1       0.53      0.31      0.39      1162
           2       0.32      0.86      0.46       800
           3       0.42      0.49      0.45      1649
           4       0.38      0.18      0.24      1220
           5       0.35      0.14      0.20      1508
           6       0.37      0.34      0.36       786

    accuracy                           0.41      8356
   macro avg       0.42      0.44      0.39      8356
weighted avg       0.42      0.41      0.38      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.60      0.90      0.72      1004
           1       0.69      0.27      0.38       800
           2       0.48      0.19      0.27      1131
           3       0.19      0.16      0.17      1103
           4       0.34      0.55      0.42      1045
           5       0.27      0.38      0.32       930
           6       0.20      0.17      0.18       918

    accuracy                           0.37      6931
   macro avg       0.40      0.37      0.35      6931
weighted avg       0.39      0.37      0.35      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.12      0.23      0.16        26
           1       0.15      0.24      0.18        25
           2       0.00      0.00      0.00        44
           3       0.03      0.40      0.06         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.50      0.11      0.18        37

    accuracy                           0.11       163
   macro avg       0.12      0.14      0.08       163
weighted avg       0.16      0.11      0.10       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.70      0.11      0.19      1567
           1       0.52      0.93      0.66      1562
           2       0.19      0.09      0.13      1163
           3       0.23      0.22      0.23      1429
           4       0.22      0.03      0.05       755
           5       0.22      0.38      0.28      1564
           6       0.25      0.29      0.26      1510

    accuracy                           0.33      9550
   macro avg       0.33      0.29      0.26      9550
weighted avg       0.35      0.33      0.28      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.19      0.26      0.22       963
           1       0.20      0.81      0.32       619
           2       0.29      0.00      0.01      1298
           3       0.01      0.02      0.01      1586
           4       0.11      0.04      0.06       984
           5       0.12      0.13      0.12      1439
           6       0.62      0.19      0.29      2304

    accuracy                           0.16      9193
   macro avg       0.22      0.21      0.15      9193
weighted avg       0.26      0.16      0.15      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.32      0.22      0.26       267
           1       0.73      0.81      0.77       666
           2       0.03      0.56      0.06        45
           3       0.55      0.03      0.05      1763
           4       0.11      0.05      0.07       188
           5       0.17      0.04      0.06      2151
           6       0.39      0.87      0.54      2196

    accuracy                           0.37      7276
   macro avg       0.33      0.37      0.26      7276
weighted avg       0.38      0.37      0.27      7276


=== Overall Accuracy ===
0.2980805825298372 [0.3403768752286864, 0.41359502154140737, 0.37209637858894823, 0.11042944785276074, 0.3255497382198953, 0.15838137713477646, 0.36613523914238594]
In [19]:
from cuml.svm import SVC as cuSVC
run_baseline_model(
    classifier_class=cuSVC,
    classifier_kwargs={"kernel": "linear", "C": 1.0, "gamma": "scale"},
    norm=True,  # important for SVMs
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.64      0.89      0.74      1222
           1       0.83      0.50      0.63      1245
           2       0.83      0.91      0.86      1015
           3       0.78      0.54      0.64      2314
           4       0.03      0.00      0.00      1876
           5       0.26      0.35      0.30      1699
           6       0.31      0.69      0.43      1561

    accuracy                           0.51     10932
   macro avg       0.52      0.55      0.52     10932
weighted avg       0.50      0.51      0.48     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.59      0.59      0.59      1231
           1       0.57      0.55      0.56      1162
           2       0.56      0.99      0.72       800
           3       0.49      0.70      0.57      1649
           4       0.59      0.62      0.61      1220
           5       0.30      0.11      0.16      1508
           6       0.78      0.42      0.55       786

    accuracy                           0.55      8356
   macro avg       0.55      0.57      0.54      8356
weighted avg       0.53      0.55      0.52      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.73      0.95      0.83      1004
           1       0.91      0.55      0.69       800
           2       0.72      0.67      0.70      1131
           3       0.43      0.47      0.45      1103
           4       0.43      0.76      0.55      1045
           5       0.46      0.35      0.40       930
           6       0.54      0.18      0.27       918

    accuracy                           0.57      6931
   macro avg       0.60      0.56      0.55      6931
weighted avg       0.59      0.57      0.56      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      0.60      0.10         5
           4       0.50      0.15      0.24        13
           5       0.05      0.23      0.08        13
           6       0.72      0.78      0.75        37

    accuracy                           0.23       163
   macro avg       0.19      0.25      0.17       163
weighted avg       0.21      0.23      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.77      0.24      0.36      1567
           1       0.55      0.93      0.69      1562
           2       0.61      0.57      0.59      1163
           3       0.44      0.28      0.34      1429
           4       0.72      0.15      0.24       755
           5       0.31      0.62      0.41      1564
           6       0.47      0.35      0.40      1510

    accuracy                           0.47      9550
   macro avg       0.55      0.45      0.43      9550
weighted avg       0.54      0.47      0.45      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.25      0.37      0.30       963
           1       0.26      0.47      0.33       619
           2       0.74      0.07      0.13      1298
           3       0.18      0.42      0.25      1586
           4       0.31      0.19      0.23       984
           5       0.09      0.11      0.10      1439
           6       0.95      0.20      0.34      2304

    accuracy                           0.24      9193
   macro avg       0.40      0.26      0.24      9193
weighted avg       0.46      0.24      0.24      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.44      0.18      0.26       267
           1       0.74      0.91      0.81       666
           2       0.03      0.82      0.06        45
           3       0.96      0.02      0.03      1763
           4       0.26      0.57      0.36       188
           5       0.10      0.01      0.02      2151
           6       0.47      0.97      0.64      2196

    accuracy                           0.41      7276
   macro avg       0.43      0.50      0.31      7276
weighted avg       0.50      0.41      0.30      7276


=== Overall Accuracy ===
0.4245612213582959 [0.508781558726674, 0.5453566299664911, 0.5719232433992208, 0.22699386503067484, 0.4700523560209424, 0.23953007723267702, 0.40929081913139087]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.895746     0.565127     0.940765 0.565037 0.940785
  FUSHomozygous_Untreated  0.906166     0.667051     0.937546 0.583621 0.955470
   FUSRevertant_Untreated  0.905803     0.594432     0.942288 0.546870 0.951989
           OPTN_Untreated  0.777294     0.407656     0.862850 0.407573 0.862891
           TBK1_Untreated  0.875957     0.321658     0.948726 0.451628 0.914188
          TDP43_Untreated  0.741226     0.239359     0.849572 0.255683 0.838022
             WT_Untreated  0.805767     0.506873     0.870361 0.457986 0.890913
            Macro Average  0.843994     0.471737     0.907444 0.466914 0.907751
In [22]:
from cuml.svm import SVC as cuSVC
run_baseline_model(
    classifier_class=cuSVC,
    classifier_kwargs={"kernel": "linear", "C": 1.0},
    norm=True,  # important for SVMs
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.64      0.89      0.74      1222
           1       0.83      0.50      0.63      1245
           2       0.83      0.91      0.86      1015
           3       0.78      0.54      0.64      2314
           4       0.03      0.00      0.00      1876
           5       0.26      0.35      0.30      1699
           6       0.31      0.69      0.43      1561

    accuracy                           0.51     10932
   macro avg       0.52      0.55      0.52     10932
weighted avg       0.50      0.51      0.48     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.59      0.59      0.59      1231
           1       0.57      0.55      0.56      1162
           2       0.56      0.99      0.72       800
           3       0.49      0.70      0.57      1649
           4       0.59      0.62      0.61      1220
           5       0.30      0.11      0.16      1508
           6       0.78      0.42      0.55       786

    accuracy                           0.55      8356
   macro avg       0.55      0.57      0.54      8356
weighted avg       0.53      0.55      0.52      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.73      0.95      0.83      1004
           1       0.91      0.55      0.69       800
           2       0.72      0.67      0.70      1131
           3       0.43      0.47      0.45      1103
           4       0.43      0.76      0.55      1045
           5       0.46      0.35      0.40       930
           6       0.54      0.18      0.27       918

    accuracy                           0.57      6931
   macro avg       0.60      0.56      0.55      6931
weighted avg       0.59      0.57      0.56      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      0.60      0.10         5
           4       0.50      0.15      0.24        13
           5       0.05      0.23      0.08        13
           6       0.72      0.78      0.75        37

    accuracy                           0.23       163
   macro avg       0.19      0.25      0.17       163
weighted avg       0.21      0.23      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.77      0.24      0.36      1567
           1       0.55      0.93      0.69      1562
           2       0.61      0.57      0.59      1163
           3       0.44      0.28      0.34      1429
           4       0.72      0.15      0.24       755
           5       0.31      0.62      0.41      1564
           6       0.47      0.35      0.40      1510

    accuracy                           0.47      9550
   macro avg       0.55      0.45      0.43      9550
weighted avg       0.54      0.47      0.45      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.25      0.37      0.30       963
           1       0.26      0.47      0.33       619
           2       0.74      0.07      0.13      1298
           3       0.18      0.42      0.25      1586
           4       0.31      0.19      0.23       984
           5       0.09      0.11      0.10      1439
           6       0.95      0.20      0.34      2304

    accuracy                           0.24      9193
   macro avg       0.40      0.26      0.24      9193
weighted avg       0.46      0.24      0.24      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.44      0.18      0.26       267
           1       0.74      0.91      0.81       666
           2       0.03      0.82      0.06        45
           3       0.96      0.02      0.03      1763
           4       0.26      0.57      0.36       188
           5       0.10      0.01      0.02      2151
           6       0.47      0.97      0.64      2196

    accuracy                           0.41      7276
   macro avg       0.43      0.50      0.31      7276
weighted avg       0.50      0.41      0.30      7276


=== Overall Accuracy ===
0.4245612213582959 [0.508781558726674, 0.5453566299664911, 0.5719232433992208, 0.22699386503067484, 0.4700523560209424, 0.23953007723267702, 0.40929081913139087]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.895746     0.565127     0.940765 0.565037 0.940785
  FUSHomozygous_Untreated  0.906166     0.667051     0.937546 0.583621 0.955470
   FUSRevertant_Untreated  0.905803     0.594432     0.942288 0.546870 0.951989
           OPTN_Untreated  0.777294     0.407656     0.862850 0.407573 0.862891
           TBK1_Untreated  0.875957     0.321658     0.948726 0.451628 0.914188
          TDP43_Untreated  0.741226     0.239359     0.849572 0.255683 0.838022
             WT_Untreated  0.805767     0.506873     0.870361 0.457986 0.890913
            Macro Average  0.843994     0.471737     0.907444 0.466914 0.907751
In [20]:
from cuml.svm import SVC as cuSVC
run_baseline_model(
    classifier_class=cuSVC,
    classifier_kwargs={"kernel": "rbf", "C": 1.0, "gamma": "scale"},
    norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.65      0.89      0.75      1222
           1       0.83      0.52      0.64      1245
           2       0.89      0.85      0.87      1015
           3       0.71      0.63      0.67      2314
           4       0.00      0.00      0.00      1876
           5       0.30      0.55      0.39      1699
           6       0.40      0.58      0.47      1561

    accuracy                           0.54     10932
   macro avg       0.54      0.58      0.54     10932
weighted avg       0.50      0.54      0.51     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.60      0.79      0.68      1231
           1       0.68      0.44      0.53      1162
           2       0.40      1.00      0.57       800
           3       0.52      0.74      0.61      1649
           4       0.66      0.45      0.54      1220
           5       0.27      0.05      0.08      1508
           6       0.76      0.55      0.64       786

    accuracy                           0.55      8356
   macro avg       0.56      0.57      0.52      8356
weighted avg       0.54      0.55      0.50      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.70      0.98      0.82      1004
           1       0.95      0.47      0.63       800
           2       0.79      0.69      0.74      1131
           3       0.41      0.38      0.40      1103
           4       0.40      0.87      0.55      1045
           5       0.54      0.25      0.35       930
           6       0.55      0.25      0.35       918

    accuracy                           0.57      6931
   macro avg       0.62      0.56      0.55      6931
weighted avg       0.61      0.57      0.55      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      1.00      0.10         5
           4       0.00      0.00      0.00        13
           5       0.03      0.08      0.04        13
           6       0.84      0.84      0.84        37

    accuracy                           0.23       163
   macro avg       0.13      0.27      0.14       163
weighted avg       0.19      0.23      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.87      0.15      0.26      1567
           1       0.53      0.97      0.69      1562
           2       0.63      0.33      0.43      1163
           3       0.44      0.41      0.42      1429
           4       0.86      0.11      0.19       755
           5       0.28      0.69      0.39      1564
           6       0.41      0.13      0.20      1510

    accuracy                           0.43      9550
   macro avg       0.57      0.40      0.37      9550
weighted avg       0.55      0.43      0.38      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.19      0.24      0.21       963
           1       0.29      0.62      0.40       619
           2       0.88      0.04      0.08      1298
           3       0.19      0.38      0.25      1586
           4       0.40      0.16      0.23       984
           5       0.09      0.13      0.11      1439
           6       0.99      0.42      0.59      2304

    accuracy                           0.28      9193
   macro avg       0.43      0.28      0.27      9193
weighted avg       0.50      0.28      0.29      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.06      0.01      0.01       267
           1       0.71      0.95      0.81       666
           2       0.02      0.56      0.03        45
           3       0.97      0.02      0.04      1763
           4       0.26      0.45      0.33       188
           5       0.05      0.01      0.01      2151
           6       0.51      0.97      0.67      2196

    accuracy                           0.40      7276
   macro avg       0.37      0.42      0.27      7276
weighted avg       0.48      0.40      0.30      7276


=== Overall Accuracy ===
0.4280018711276649 [0.5405232345407976, 0.5460746768788894, 0.569470494878084, 0.22699386503067484, 0.42879581151832463, 0.2817361035570543, 0.40241891148982956]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.895097     0.561146     0.940569 0.562490 0.940264
  FUSHomozygous_Untreated  0.906376     0.670012     0.937395 0.584110 0.955842
   FUSRevertant_Untreated  0.887655     0.529294     0.929645 0.468513 0.943995
           OPTN_Untreated  0.785348     0.441974     0.864824 0.430777 0.870059
           TBK1_Untreated  0.876930     0.293702     0.953497 0.453299 0.911372
          TDP43_Untreated  0.728383     0.269454     0.827459 0.252137 0.839912
             WT_Untreated  0.838858     0.528028     0.906032 0.548405 0.898814
            Macro Average  0.845521     0.470516     0.908489 0.471390 0.908608
In [23]:
from cuml.neighbors import KNeighborsClassifier as cuKNNClassifier
run_baseline_model(
    classifier_class=cuKNNClassifier,
    classifier_kwargs={"n_neighbors": 5},
    norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.39      0.47      0.43      1222
           1       0.50      0.75      0.60      1245
           2       0.76      0.25      0.37      1015
           3       0.42      0.43      0.43      2314
           4       0.17      0.05      0.07      1876
           5       0.27      0.66      0.38      1699
           6       0.52      0.06      0.11      1561

    accuracy                           0.37     10932
   macro avg       0.43      0.38      0.34     10932
weighted avg       0.40      0.37      0.33     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.57      0.41      0.48      1231
           1       0.53      0.72      0.61      1162
           2       0.35      0.86      0.49       800
           3       0.44      0.51      0.47      1649
           4       0.33      0.31      0.32      1220
           5       0.27      0.12      0.17      1508
           6       0.53      0.11      0.18       786

    accuracy                           0.42      8356
   macro avg       0.43      0.44      0.39      8356
weighted avg       0.42      0.42      0.39      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.46      0.78      0.58      1004
           1       0.45      0.27      0.33       800
           2       0.66      0.10      0.18      1131
           3       0.31      0.30      0.31      1103
           4       0.29      0.82      0.43      1045
           5       0.36      0.11      0.17       930
           6       0.41      0.10      0.16       918

    accuracy                           0.36      6931
   macro avg       0.42      0.36      0.31      6931
weighted avg       0.42      0.36      0.31      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.69      0.35      0.46        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.04      1.00      0.08         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.81      0.70      0.75        37

    accuracy                           0.25       163
   macro avg       0.22      0.29      0.19       163
weighted avg       0.30      0.25      0.25       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.61      0.38      0.47      1567
           1       0.54      0.73      0.62      1562
           2       0.21      0.14      0.17      1163
           3       0.25      0.32      0.28      1429
           4       0.13      0.11      0.12       755
           5       0.24      0.38      0.29      1564
           6       0.39      0.19      0.26      1510

    accuracy                           0.35      9550
   macro avg       0.34      0.32      0.31      9550
weighted avg       0.36      0.35      0.34      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.25      0.34      0.29       963
           1       0.22      0.77      0.35       619
           2       0.36      0.05      0.09      1298
           3       0.24      0.21      0.23      1586
           4       0.10      0.01      0.02       984
           5       0.15      0.23      0.18      1439
           6       0.68      0.54      0.60      2304

    accuracy                           0.31      9193
   macro avg       0.29      0.31      0.25      9193
weighted avg       0.34      0.31      0.29      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.05      0.09      0.06       267
           1       0.45      0.39      0.42       666
           2       0.01      0.47      0.02        45
           3       0.69      0.03      0.06      1763
           4       0.14      0.06      0.09       188
           5       0.19      0.03      0.05      2151
           6       0.54      0.89      0.67      2196

    accuracy                           0.33      7276
   macro avg       0.29      0.28      0.19      7276
weighted avg       0.43      0.33      0.27      7276


=== Overall Accuracy ===
0.33995055533638296 [0.37193560190267105, 0.42161321206318814, 0.3612754292309912, 0.24539877300613497, 0.34701570680628274, 0.305449798759926, 0.32696536558548656]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.855690     0.450796     0.910822 0.407690 0.924126
  FUSHomozygous_Untreated  0.864449     0.634973     0.894564 0.441446 0.949172
   FUSRevertant_Untreated  0.839125     0.238173     0.909541 0.235771 0.910628
           OPTN_Untreated  0.761092     0.306630     0.866281 0.346728 0.843698
           TBK1_Untreated  0.833973     0.235981     0.912478 0.261432 0.900964
          TDP43_Untreated  0.719528     0.256664     0.819454 0.234831 0.836238
             WT_Untreated  0.837350     0.407109     0.930330 0.558074 0.878947
            Macro Average  0.815887     0.361475     0.891924 0.355139 0.891967
In [16]:
from cuml.neighbors import KNeighborsClassifier as cuKNNClassifier
run_baseline_model(
    classifier_class=cuKNNClassifier,
    classifier_kwargs={"n_neighbors": 50},
    norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.47      0.39      0.43      1222
           1       0.57      0.90      0.70      1245
           2       0.96      0.17      0.30      1015
           3       0.52      0.45      0.48      2314
           4       0.05      0.01      0.02      1876
           5       0.27      0.83      0.41      1699
           6       0.61      0.02      0.04      1561

    accuracy                           0.39     10932
   macro avg       0.49      0.40      0.34     10932
weighted avg       0.46      0.39      0.33     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.67      0.36      0.46      1231
           1       0.57      0.80      0.67      1162
           2       0.41      0.94      0.57       800
           3       0.52      0.64      0.57      1649
           4       0.44      0.44      0.44      1220
           5       0.30      0.17      0.21      1508
           6       0.72      0.11      0.19       786

    accuracy                           0.48      8356
   macro avg       0.52      0.49      0.44      8356
weighted avg       0.50      0.48      0.45      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.56      0.86      0.68      1004
           1       0.62      0.30      0.40       800
           2       0.88      0.05      0.10      1131
           3       0.30      0.23      0.26      1103
           4       0.26      0.96      0.41      1045
           5       0.42      0.06      0.10       930
           6       0.56      0.06      0.12       918

    accuracy                           0.36      6931
   macro avg       0.51      0.36      0.29      6931
weighted avg       0.51      0.36      0.29      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.70      0.27      0.39        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      1.00      0.09         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.78      0.84      0.81        37

    accuracy                           0.26       163
   macro avg       0.22      0.30      0.18       163
weighted avg       0.29      0.26      0.25       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.74      0.23      0.35      1567
           1       0.54      0.86      0.67      1562
           2       0.30      0.04      0.07      1163
           3       0.30      0.28      0.29      1429
           4       0.16      0.09      0.12       755
           5       0.26      0.60      0.36      1564
           6       0.45      0.29      0.35      1510

    accuracy                           0.38      9550
   macro avg       0.39      0.34      0.32      9550
weighted avg       0.42      0.38      0.34      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.31      0.28      0.29       963
           1       0.27      0.86      0.41       619
           2       0.38      0.01      0.02      1298
           3       0.26      0.23      0.25      1586
           4       0.06      0.00      0.01       984
           5       0.19      0.34      0.24      1439
           6       0.73      0.75      0.74      2304

    accuracy                           0.37      9193
   macro avg       0.31      0.35      0.28      9193
weighted avg       0.37      0.37      0.33      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.01      0.02      0.02       267
           1       0.50      0.39      0.44       666
           2       0.02      0.67      0.04        45
           3       0.88      0.01      0.02      1763
           4       0.09      0.02      0.03       188
           5       0.08      0.01      0.01      2151
           6       0.46      0.99      0.63      2196

    accuracy                           0.34      7276
   macro avg       0.29      0.30      0.17      7276
weighted avg       0.43      0.34      0.24      7276


=== Overall Accuracy ===
0.37046544679203297 [0.39096231247713137, 0.48468166586883676, 0.3648824123503102, 0.26380368098159507, 0.3769633507853403, 0.3691939519199391, 0.3427707531610775]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.876987     0.386146     0.943822 0.483453 0.918645
  FUSHomozygous_Untreated  0.882254     0.726929     0.902638 0.494904 0.961815
   FUSRevertant_Untreated  0.866968     0.195051     0.945699 0.296214 0.909311
           OPTN_Untreated  0.783897     0.318611     0.891591 0.404851 0.849698
           TBK1_Untreated  0.829278     0.269692     0.902742 0.266884 0.903991
          TDP43_Untreated  0.702658     0.339209     0.781122 0.250695 0.845574
             WT_Untreated  0.835957     0.486899     0.911393 0.542864 0.891530
            Macro Average  0.825429     0.388934     0.897001 0.391409 0.897223
In [9]:
from xgboost import XGBClassifier
run_baseline_model(
    classifier_class=XGBClassifier,
    classifier_kwargs={
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "n_estimators": 100,
        "max_depth": 6,
        "use_label_encoder": False,
        "eval_metric": "mlogloss"
    },
    norm=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:43:55] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:43:55] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:44:58] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.58      0.87      0.70      1222
           1       0.76      0.38      0.51      1245
           2       0.82      0.62      0.71      1015
           3       0.55      0.64      0.59      2314
           4       0.06      0.00      0.01      1876
           5       0.31      0.39      0.35      1699
           6       0.26      0.46      0.33      1561

    accuracy                           0.46     10932
   macro avg       0.48      0.48      0.46     10932
weighted avg       0.44      0.46      0.43     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:45:37] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:45:37] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:31] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.57      0.73      0.64      1231
           1       0.60      0.41      0.49      1162
           2       0.36      0.98      0.53       800
           3       0.49      0.58      0.53      1649
           4       0.60      0.32      0.42      1220
           5       0.30      0.09      0.14      1508
           6       0.57      0.55      0.56       786

    accuracy                           0.49      8356
   macro avg       0.50      0.52      0.47      8356
weighted avg       0.49      0.49      0.46      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:46] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:46:46] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:41] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.63      0.94      0.75      1004
           1       0.83      0.29      0.43       800
           2       0.72      0.34      0.46      1131
           3       0.33      0.32      0.32      1103
           4       0.39      0.87      0.53      1045
           5       0.39      0.28      0.33       930
           6       0.37      0.20      0.26       918

    accuracy                           0.47      6931
   macro avg       0.52      0.46      0.44      6931
weighted avg       0.51      0.47      0.44      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:58] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:47:58] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:48:57] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.30      0.12      0.17        26
           1       0.67      0.08      0.14        25
           2       0.00      0.00      0.00        44
           3       0.04      0.40      0.08         5
           4       0.25      0.15      0.19        13
           5       0.11      0.46      0.18        13
           6       0.72      0.78      0.75        37

    accuracy                           0.27       163
   macro avg       0.30      0.28      0.22       163
weighted avg       0.34      0.27      0.25       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:49:13] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:49:13] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:07] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.81      0.10      0.18      1567
           1       0.52      0.97      0.68      1562
           2       0.38      0.16      0.23      1163
           3       0.33      0.26      0.29      1429
           4       0.59      0.09      0.15       755
           5       0.25      0.61      0.36      1564
           6       0.30      0.19      0.24      1510

    accuracy                           0.37      9550
   macro avg       0.46      0.34      0.30      9550
weighted avg       0.45      0.37      0.32      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:42] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:50:42] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:51:42] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.12      0.19      0.14       963
           1       0.24      0.76      0.36       619
           2       0.93      0.03      0.06      1298
           3       0.05      0.09      0.06      1586
           4       0.12      0.04      0.06       984
           5       0.03      0.04      0.03      1439
           6       0.90      0.20      0.32      2304

    accuracy                           0.15      9193
   macro avg       0.34      0.19      0.15      9193
weighted avg       0.41      0.15      0.15      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:52:27] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:52:27] WARNING: /workspace/src/learner.cc:740: 
Parameters: { "predictor", "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/xgboost/core.py:158: UserWarning: [12:53:28] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
              precision    recall  f1-score   support

           0       0.25      0.07      0.11       267
           1       0.71      0.92      0.80       666
           2       0.03      0.80      0.06        45
           3       0.96      0.01      0.03      1763
           4       0.25      0.24      0.25       188
           5       0.05      0.01      0.01      2151
           6       0.44      0.95      0.60      2196

    accuracy                           0.39      7276
   macro avg       0.38      0.43      0.26      7276
weighted avg       0.46      0.39      0.27      7276


=== Overall Accuracy ===
0.3713374682263305 [0.46084888401024515, 0.48851124940162755, 0.47150483335737986, 0.26993865030674846, 0.37026178010471206, 0.1498966605025563, 0.3884002199010445]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.876930     0.521338     0.925349 0.487420 0.934200
  FUSHomozygous_Untreated  0.885785     0.621155     0.920513 0.506302 0.948757
   FUSRevertant_Untreated  0.875728     0.377365     0.934122 0.401627 0.927557
           OPTN_Untreated  0.748841     0.337395     0.844073 0.333702 0.846241
           TBK1_Untreated  0.868819     0.239270     0.951468 0.392925 0.905006
          TDP43_Untreated  0.729032     0.224312     0.837993 0.230125 0.833449
             WT_Untreated  0.784775     0.449205     0.857295 0.404859 0.878081
            Macro Average  0.824273     0.395720     0.895831 0.393851 0.896184
In [15]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (100,),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    norm=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.50      0.90      0.64      1222
           1       0.72      0.32      0.44      1245
           2       0.86      0.88      0.87      1015
           3       0.72      0.63      0.67      2314
           4       0.09      0.00      0.00      1876
           5       0.31      0.38      0.34      1699
           6       0.38      0.73      0.50      1561

    accuracy                           0.52     10932
   macro avg       0.51      0.55      0.50     10932
weighted avg       0.49      0.52      0.47     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.54      0.77      0.63      1231
           1       0.67      0.31      0.42      1162
           2       0.28      1.00      0.44       800
           3       0.60      0.56      0.58      1649
           4       0.72      0.29      0.41      1220
           5       0.25      0.06      0.09      1508
           6       0.60      0.64      0.62       786

    accuracy                           0.47      8356
   macro avg       0.52      0.52      0.46      8356
weighted avg       0.52      0.47      0.44      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.63      0.96      0.76      1004
           1       0.89      0.39      0.55       800
           2       0.65      0.80      0.72      1131
           3       0.42      0.25      0.31      1103
           4       0.46      0.76      0.57      1045
           5       0.63      0.23      0.34       930
           6       0.56      0.58      0.57       918

    accuracy                           0.58      6931
   macro avg       0.61      0.57      0.55      6931
weighted avg       0.60      0.58      0.55      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      0.60      0.09         5
           4       1.00      0.08      0.14        13
           5       0.05      0.23      0.09        13
           6       0.69      0.78      0.73        37

    accuracy                           0.22       163
   macro avg       0.26      0.24      0.15       163
weighted avg       0.24      0.22      0.19       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.84      0.26      0.40      1567
           1       0.55      0.94      0.69      1562
           2       0.52      0.48      0.50      1163
           3       0.33      0.43      0.38      1429
           4       0.84      0.18      0.30       755
           5       0.24      0.44      0.31      1564
           6       0.41      0.10      0.16      1510

    accuracy                           0.42      9550
   macro avg       0.53      0.41      0.39      9550
weighted avg       0.51      0.42      0.40      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.26      0.29      0.28       963
           1       0.25      0.53      0.34       619
           2       0.72      0.09      0.16      1298
           3       0.31      0.59      0.40      1586
           4       0.40      0.25      0.31       984
           5       0.13      0.13      0.13      1439
           6       0.94      0.64      0.76      2304

    accuracy                           0.39      9193
   macro avg       0.43      0.36      0.34      9193
weighted avg       0.50      0.39      0.39      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       267
           1       0.72      0.86      0.78       666
           2       0.01      0.22      0.01        45
           3       1.00      0.01      0.01      1763
           4       0.25      0.57      0.34       188
           5       0.05      0.00      0.01      2151
           6       0.55      0.95      0.70      2196

    accuracy                           0.39      7276
   macro avg       0.37      0.37      0.26      7276
weighted avg       0.49      0.39      0.30      7276


=== Overall Accuracy ===
0.4262872123601977 [0.5162824734723747, 0.4743896601244615, 0.5765401817919492, 0.22085889570552147, 0.42303664921465967, 0.3872511693679974, 0.38565145684442004]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.885880     0.587420     0.926519 0.521192 0.942833
  FUSHomozygous_Untreated  0.896471     0.566376     0.939791 0.552471 0.942905
   FUSRevertant_Untreated  0.858953     0.598071     0.889521 0.388121 0.949718
           OPTN_Untreated  0.797771     0.428267     0.883296 0.459277 0.869704
           TBK1_Untreated  0.880269     0.270021     0.960384 0.472246 0.909267
          TDP43_Untreated  0.752199     0.197120     0.872033 0.249558 0.834192
             WT_Untreated  0.846224     0.637135     0.891411 0.559084 0.919141
            Macro Average  0.845396     0.469201     0.908994 0.457421 0.909680
In [17]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (100,),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    norm=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.53      0.90      0.67      1222
           1       0.71      0.20      0.31      1245
           2       0.82      0.86      0.84      1015
           3       0.77      0.68      0.72      2314
           4       0.07      0.00      0.00      1876
           5       0.30      0.63      0.40      1699
           6       0.40      0.46      0.43      1561

    accuracy                           0.51     10932
   macro avg       0.51      0.53      0.48     10932
weighted avg       0.50      0.51      0.47     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.58      0.77      0.66      1231
           1       0.66      0.38      0.48      1162
           2       0.40      0.99      0.57       800
           3       0.62      0.67      0.64      1649
           4       0.74      0.47      0.57      1220
           5       0.40      0.15      0.22      1508
           6       0.54      0.64      0.58       786

    accuracy                           0.55      8356
   macro avg       0.56      0.58      0.53      8356
weighted avg       0.57      0.55      0.52      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.65      0.94      0.77      1004
           1       0.82      0.39      0.53       800
           2       0.58      0.76      0.66      1131
           3       0.44      0.26      0.33      1103
           4       0.37      0.68      0.48      1045
           5       0.43      0.16      0.23       930
           6       0.58      0.45      0.51       918

    accuracy                           0.53      6931
   macro avg       0.55      0.52      0.50      6931
weighted avg       0.54      0.53      0.50      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.50      0.04      0.07        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.07      0.80      0.12         5
           4       0.75      0.23      0.35        13
           5       0.06      0.15      0.09        13
           6       0.54      0.95      0.69        37

    accuracy                           0.28       163
   macro avg       0.27      0.31      0.19       163
weighted avg       0.27      0.28      0.21       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.89      0.12      0.22      1567
           1       0.53      0.98      0.69      1562
           2       0.70      0.57      0.63      1163
           3       0.36      0.40      0.38      1429
           4       0.78      0.22      0.35       755
           5       0.30      0.66      0.41      1564
           6       0.31      0.06      0.10      1510

    accuracy                           0.44      9550
   macro avg       0.55      0.43      0.40      9550
weighted avg       0.53      0.44      0.39      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.33      0.24      0.28       963
           1       0.22      0.70      0.34       619
           2       0.63      0.15      0.24      1298
           3       0.30      0.49      0.37      1586
           4       0.47      0.49      0.48       984
           5       0.15      0.18      0.16      1439
           6       0.98      0.34      0.51      2304

    accuracy                           0.35      9193
   macro avg       0.44      0.37      0.34      9193
weighted avg       0.51      0.35      0.35      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.04      0.01      0.02       267
           1       0.69      0.87      0.77       666
           2       0.01      0.51      0.02        45
           3       1.00      0.02      0.04      1763
           4       0.22      0.59      0.32       188
           5       0.18      0.02      0.04      2151
           6       0.55      0.88      0.68      2196

    accuracy                           0.38      7276
   macro avg       0.38      0.42      0.27      7276
weighted avg       0.53      0.38      0.30      7276


=== Overall Accuracy ===
0.43327501133055335 [0.5113428466886205, 0.549186213499282, 0.530370797864666, 0.27607361963190186, 0.44418848167539265, 0.34504514304362016, 0.37671797691039033]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.892865     0.545541     0.940157 0.553831 0.938245
  FUSHomozygous_Untreated  0.883914     0.583155     0.923384 0.499718 0.944070
   FUSRevertant_Untreated  0.876453     0.618086     0.906726 0.437082 0.952968
           OPTN_Untreated  0.811721     0.442685     0.897138 0.499027 0.874290
           TBK1_Untreated  0.876758     0.338431     0.947431 0.458046 0.916027
          TDP43_Untreated  0.738287     0.299656     0.832981 0.279191 0.846375
             WT_Untreated  0.838114     0.481207     0.915245 0.550965 0.890869
            Macro Average  0.845445     0.472680     0.909009 0.468266 0.908978
In [30]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (200,),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    norm=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.50      0.79      0.62      1222
           1       0.56      0.23      0.33      1245
           2       0.82      0.89      0.85      1015
           3       0.78      0.61      0.69      2314
           4       0.07      0.00      0.00      1876
           5       0.27      0.42      0.33      1699
           6       0.32      0.59      0.41      1561

    accuracy                           0.48     10932
   macro avg       0.47      0.51      0.46     10932
weighted avg       0.46      0.48      0.44     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.56      0.83      0.67      1231
           1       0.69      0.31      0.42      1162
           2       0.40      1.00      0.57       800
           3       0.63      0.67      0.65      1649
           4       0.69      0.37      0.48      1220
           5       0.42      0.12      0.19      1508
           6       0.50      0.73      0.59       786

    accuracy                           0.54      8356
   macro avg       0.56      0.58      0.51      8356
weighted avg       0.57      0.54      0.50      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.66      0.97      0.79      1004
           1       0.92      0.41      0.57       800
           2       0.68      0.76      0.72      1131
           3       0.39      0.18      0.25      1103
           4       0.39      0.81      0.53      1045
           5       0.46      0.22      0.30       930
           6       0.57      0.43      0.49       918

    accuracy                           0.55      6931
   macro avg       0.58      0.54      0.52      6931
weighted avg       0.57      0.55      0.52      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       1.00      0.12      0.21        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.06      0.60      0.10         5
           4       0.75      0.23      0.35        13
           5       0.08      0.31      0.13        13
           6       0.65      0.95      0.77        37

    accuracy                           0.29       163
   macro avg       0.36      0.31      0.22       163
weighted avg       0.37      0.29      0.25       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.86      0.15      0.25      1567
           1       0.53      0.97      0.69      1562
           2       0.66      0.60      0.63      1163
           3       0.38      0.51      0.43      1429
           4       0.77      0.22      0.34       755
           5       0.27      0.51      0.35      1564
           6       0.43      0.08      0.13      1510

    accuracy                           0.44      9550
   macro avg       0.56      0.43      0.40      9550
weighted avg       0.54      0.44      0.40      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.43      0.34      0.38       963
           1       0.27      0.69      0.39       619
           2       0.74      0.16      0.26      1298
           3       0.31      0.52      0.39      1586
           4       0.57      0.37      0.45       984
           5       0.13      0.22      0.16      1439
           6       0.99      0.33      0.49      2304

    accuracy                           0.35      9193
   macro avg       0.49      0.38      0.36      9193
weighted avg       0.55      0.35      0.37      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.01      0.00      0.01       267
           1       0.71      0.88      0.78       666
           2       0.02      0.51      0.03        45
           3       1.00      0.01      0.01      1763
           4       0.25      0.60      0.35       188
           5       0.07      0.01      0.02      2151
           6       0.51      0.97      0.67      2196

    accuracy                           0.40      7276
   macro avg       0.37      0.43      0.27      7276
weighted avg       0.49      0.40      0.29      7276


=== Overall Accuracy ===
0.43613351337378764 [0.47676545920234176, 0.5385351842987075, 0.551435579281489, 0.294478527607362, 0.4449214659685864, 0.3507016207984336, 0.3960967564595932]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.894353     0.561146     0.939724 0.559010 0.940213
  FUSHomozygous_Untreated  0.890327     0.577233     0.931415 0.524828 0.943782
   FUSRevertant_Untreated  0.891949     0.633552     0.922226 0.488359 0.955512
           OPTN_Untreated  0.809088     0.436389     0.895352 0.491144 0.872829
           TBK1_Untreated  0.878628     0.320506     0.951900 0.466603 0.914317
          TDP43_Untreated  0.730482     0.241294     0.836091 0.241164 0.836188
             WT_Untreated  0.818782     0.529424     0.881315 0.490840 0.896546
            Macro Average  0.844801     0.471364     0.908289 0.465993 0.908484
In [18]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (50,),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    apply_pca = True,
    pca_components = 100
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.38      0.53      0.44      1222
           1       0.24      0.14      0.18      1245
           2       0.59      0.68      0.63      1015
           3       0.47      0.52      0.50      2314
           4       0.03      0.00      0.00      1876
           5       0.31      0.59      0.41      1699
           6       0.37      0.33      0.35      1561

    accuracy                           0.39     10932
   macro avg       0.34      0.40      0.36     10932
weighted avg       0.33      0.39      0.35     10932

here!
here2!
here3!

=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.59      0.79      0.67      1231
           1       0.69      0.40      0.50      1162
           2       0.23      0.98      0.38       800
           3       0.53      0.47      0.50      1649
           4       0.58      0.18      0.27      1220
           5       0.13      0.02      0.03      1508
           6       0.56      0.45      0.50       786

    accuracy                           0.43      8356
   macro avg       0.47      0.47      0.41      8356
weighted avg       0.47      0.43      0.40      8356

here!
here2!
here3!

=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.63      0.83      0.72      1004
           1       0.66      0.40      0.50       800
           2       0.74      0.53      0.62      1131
           3       0.27      0.21      0.24      1103
           4       0.37      0.84      0.52      1045
           5       0.40      0.11      0.18       930
           6       0.44      0.38      0.41       918

    accuracy                           0.48      6931
   macro avg       0.50      0.47      0.45      6931
weighted avg       0.50      0.48      0.46      6931

here!
here2!
here3!

=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.56      0.35      0.43        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.06      1.00      0.11         5
           4       0.00      0.00      0.00        13
           5       0.03      0.08      0.04        13
           6       0.96      0.59      0.73        37

    accuracy                           0.23       163
   macro avg       0.23      0.29      0.19       163
weighted avg       0.31      0.23      0.24       163

here!
here2!
here3!

=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.94      0.19      0.31      1567
           1       0.55      0.98      0.70      1562
           2       0.28      0.14      0.19      1163
           3       0.25      0.36      0.30      1429
           4       0.43      0.30      0.36       755
           5       0.26      0.43      0.33      1564
           6       0.30      0.15      0.20      1510

    accuracy                           0.38      9550
   macro avg       0.43      0.36      0.34      9550
weighted avg       0.44      0.38      0.35      9550

here!
here2!
here3!

=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.16      0.09      0.11       963
           1       0.23      0.81      0.36       619
           2       0.14      0.04      0.06      1298
           3       0.32      0.40      0.36      1586
           4       0.39      0.38      0.38       984
           5       0.15      0.22      0.18      1439
           6       0.90      0.43      0.58      2304

    accuracy                           0.32      9193
   macro avg       0.33      0.34      0.29      9193
weighted avg       0.40      0.32      0.32      9193

here!
here2!
here3!

=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  warnings.warn(
              precision    recall  f1-score   support

           0       0.02      0.01      0.01       267
           1       0.66      0.76      0.71       666
           2       0.00      0.27      0.01        45
           3       0.90      0.01      0.01      1763
           4       0.22      0.25      0.23       188
           5       0.02      0.00      0.00      2151
           6       0.62      0.84      0.72      2196

    accuracy                           0.33      7276
   macro avg       0.35      0.30      0.24      7276
weighted avg       0.48      0.33      0.29      7276

here!
here2!
here3!

=== Overall Accuracy ===
0.36560145872024213 [0.38922429564581046, 0.43011010052656773, 0.47972875486942723, 0.22699386503067484, 0.37926701570680627, 0.320461220493854, 0.33342495876855416]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.880327     0.452229     0.938618 0.500794 0.926386
  FUSHomozygous_Untreated  0.872216     0.574272     0.911316 0.459403 0.942235
   FUSRevertant_Untreated  0.804183     0.419760     0.849227 0.245975 0.925875
           OPTN_Untreated  0.768554     0.345111     0.866563 0.374463 0.851122
           TBK1_Untreated  0.863037     0.287124     0.938644 0.380558 0.909334
          TDP43_Untreated  0.742581     0.228289     0.853609 0.251868 0.836699
             WT_Untreated  0.840270     0.461555     0.922115 0.561536 0.887948
            Macro Average  0.824453     0.395477     0.897156 0.396371 0.897086
In [28]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (100,),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    apply_pca = True,
    pca_components = 200
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.46      0.79      0.58      1222
           1       0.43      0.14      0.22      1245
           2       0.61      0.78      0.69      1015
           3       0.54      0.45      0.49      2314
           4       0.07      0.00      0.01      1876
           5       0.28      0.39      0.33      1699
           6       0.30      0.53      0.38      1561

    accuracy                           0.41     10932
   macro avg       0.39      0.44      0.39     10932
weighted avg       0.37      0.41      0.36     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.60      0.74      0.67      1231
           1       0.73      0.45      0.56      1162
           2       0.27      0.98      0.43       800
           3       0.54      0.40      0.46      1649
           4       0.49      0.32      0.39      1220
           5       0.17      0.05      0.08      1508
           6       0.54      0.51      0.52       786

    accuracy                           0.45      8356
   macro avg       0.48      0.49      0.44      8356
weighted avg       0.48      0.45      0.43      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.65      0.91      0.76      1004
           1       0.80      0.41      0.54       800
           2       0.71      0.61      0.66      1131
           3       0.29      0.20      0.24      1103
           4       0.37      0.83      0.51      1045
           5       0.43      0.14      0.21       930
           6       0.55      0.44      0.49       918

    accuracy                           0.51      6931
   macro avg       0.54      0.51      0.49      6931
weighted avg       0.53      0.51      0.49      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.80      0.15      0.26        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.04      1.00      0.09         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.68      0.51      0.58        37

    accuracy                           0.17       163
   macro avg       0.22      0.24      0.13       163
weighted avg       0.28      0.17      0.18       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.83      0.20      0.32      1567
           1       0.54      0.92      0.68      1562
           2       0.45      0.24      0.31      1163
           3       0.29      0.46      0.36      1429
           4       0.58      0.20      0.30       755
           5       0.20      0.30      0.24      1564
           6       0.31      0.20      0.24      1510

    accuracy                           0.38      9550
   macro avg       0.46      0.36      0.35      9550
weighted avg       0.45      0.38      0.36      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.30      0.22      0.26       963
           1       0.24      0.70      0.35       619
           2       0.25      0.06      0.10      1298
           3       0.33      0.43      0.38      1586
           4       0.33      0.29      0.31       984
           5       0.17      0.24      0.20      1439
           6       0.87      0.50      0.63      2304

    accuracy                           0.35      9193
   macro avg       0.35      0.35      0.32      9193
weighted avg       0.42      0.35      0.35      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.01      0.00      0.00       267
           1       0.65      0.72      0.68       666
           2       0.00      0.18      0.01        45
           3       0.87      0.01      0.03      1763
           4       0.22      0.42      0.29       188
           5       0.10      0.01      0.02      2151
           6       0.53      0.91      0.67      2196

    accuracy                           0.36      7276
   macro avg       0.34      0.32      0.24      7276
weighted avg       0.46      0.36      0.29      7276


=== Overall Accuracy ===
0.3759157995635852 [0.4101719721917307, 0.45021541407371946, 0.5129129995671621, 0.17177914110429449, 0.37863874345549736, 0.3473294898292179, 0.3603628367234744]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.887101     0.530414     0.935669 0.528898 0.936035
  FUSHomozygous_Untreated  0.883266     0.557164     0.926061 0.497211 0.940951
   FUSRevertant_Untreated  0.841034     0.479803     0.883360 0.325234 0.935453
           OPTN_Untreated  0.778821     0.336278     0.881251 0.395935 0.851553
           TBK1_Untreated  0.861701     0.293866     0.936248 0.377004 0.909905
          TDP43_Untreated  0.738994     0.183147     0.858993 0.218995 0.829673
             WT_Untreated  0.820519     0.547895     0.879436 0.495484 0.900010
            Macro Average  0.830205     0.418367     0.900145 0.405537 0.900511
In [29]:
from sklearn.neural_network import MLPClassifier
run_baseline_model(
    classifier_class=MLPClassifier,
    classifier_kwargs={
        "hidden_layer_sizes": (100,50),  # You can adjust e.g., (256, 128)
        "activation": "relu",
        "solver": "adam",
        "max_iter": 200,
        "random_state": 42
    },
    apply_pca = False,
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.50      0.90      0.65      1222
           1       0.58      0.12      0.20      1245
           2       0.73      0.90      0.80      1015
           3       0.79      0.67      0.72      2314
           4       0.11      0.00      0.00      1876
           5       0.28      0.43      0.34      1699
           6       0.31      0.50      0.38      1561

    accuracy                           0.48     10932
   macro avg       0.47      0.50      0.44     10932
weighted avg       0.46      0.48      0.43     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.59      0.85      0.70      1231
           1       0.74      0.36      0.48      1162
           2       0.34      1.00      0.51       800
           3       0.67      0.61      0.64      1649
           4       0.72      0.38      0.50      1220
           5       0.33      0.12      0.18      1508
           6       0.57      0.72      0.64       786

    accuracy                           0.54      8356
   macro avg       0.57      0.58      0.52      8356
weighted avg       0.57      0.54      0.51      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.62      0.97      0.76      1004
           1       0.88      0.29      0.43       800
           2       0.65      0.78      0.71      1131
           3       0.38      0.28      0.32      1103
           4       0.42      0.75      0.54      1045
           5       0.38      0.10      0.16       930
           6       0.58      0.49      0.53       918

    accuracy                           0.54      6931
   macro avg       0.56      0.52      0.49      6931
weighted avg       0.55      0.54      0.50      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.33      0.04      0.07        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.05      0.60      0.08         5
           4       0.67      0.15      0.25        13
           5       0.12      0.23      0.16        13
           6       0.53      0.95      0.68        37

    accuracy                           0.27       163
   macro avg       0.24      0.28      0.18       163
weighted avg       0.24      0.27      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.93      0.11      0.20      1567
           1       0.52      0.98      0.68      1562
           2       0.71      0.52      0.60      1163
           3       0.35      0.42      0.38      1429
           4       0.68      0.17      0.28       755
           5       0.28      0.59      0.38      1564
           6       0.37      0.10      0.15      1510

    accuracy                           0.43      9550
   macro avg       0.55      0.41      0.38      9550
weighted avg       0.54      0.43      0.38      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.35      0.33      0.34       963
           1       0.26      0.65      0.37       619
           2       0.80      0.15      0.26      1298
           3       0.28      0.53      0.37      1586
           4       0.38      0.38      0.38       984
           5       0.10      0.10      0.10      1439
           6       0.96      0.44      0.61      2304

    accuracy                           0.36      9193
   macro avg       0.45      0.37      0.35      9193
weighted avg       0.51      0.36      0.37      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.03      0.01      0.01       267
           1       0.70      0.89      0.78       666
           2       0.01      0.18      0.01        45
           3       0.84      0.02      0.04      1763
           4       0.25      0.69      0.37       188
           5       0.03      0.00      0.01      2151
           6       0.49      0.96      0.65      2196

    accuracy                           0.40      7276
   macro avg       0.33      0.39      0.27      7276
weighted avg       0.43      0.40      0.29      7276


=== Overall Accuracy ===
0.4296564697543368 [0.47923527259421883, 0.5351842987075156, 0.5380176020776223, 0.26993865030674846, 0.42921465968586386, 0.358533666920483, 0.39747113798790545]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.890556     0.577707     0.933154 0.540605 0.941957
  FUSHomozygous_Untreated  0.888475     0.546965     0.933293 0.518316 0.940112
   FUSRevertant_Untreated  0.885308     0.618814     0.916533 0.464872 0.953532
           OPTN_Untreated  0.804317     0.441263     0.888348 0.477740 0.872922
           TBK1_Untreated  0.875079     0.309653     0.949309 0.445048 0.912850
          TDP43_Untreated  0.739795     0.224097     0.851127 0.245265 0.835558
             WT_Untreated  0.823744     0.548003     0.883334 0.503751 0.900428
            Macro Average  0.843896     0.466643     0.907871 0.456514 0.908194
In [24]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'TDP43_Untreated':1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
              precision    recall  f1-score   support

           0       0.39      0.86      0.53      2576
           1       0.87      0.41      0.56      5889
           2       1.00      1.00      1.00      2467

    accuracy                           0.65     10932
   macro avg       0.75      0.76      0.70     10932
weighted avg       0.78      0.65      0.65     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
              precision    recall  f1-score   support

           0       0.50      0.86      0.63      1586
           1       0.92      0.69      0.79      4377
           2       1.00      0.99      0.99      2393

    accuracy                           0.81      8356
   macro avg       0.81      0.85      0.81      8356
weighted avg       0.87      0.81      0.82      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
              precision    recall  f1-score   support

           0       0.76      0.57      0.65      2049
           1       0.75      0.88      0.81      3078
           2       1.00      1.00      1.00      1804

    accuracy                           0.82      6931
   macro avg       0.84      0.82      0.82      6931
weighted avg       0.82      0.82      0.81      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
              precision    recall  f1-score   support

           0       0.80      0.05      0.09        81
           1       0.20      1.00      0.33        31
           2       1.00      0.02      0.04        51

    accuracy                           0.22       163
   macro avg       0.67      0.36      0.15       163
weighted avg       0.75      0.22      0.12       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
              precision    recall  f1-score   support

           0       0.54      0.62      0.58      2673
           1       0.70      0.62      0.66      3748
           2       1.00      1.00      1.00      3129

    accuracy                           0.75      9550
   macro avg       0.75      0.75      0.75      9550
weighted avg       0.75      0.75      0.75      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
              precision    recall  f1-score   support

           0       0.94      0.17      0.29      3602
           1       0.51      0.76      0.61      4009
           2       0.63      1.00      0.77      1582

    accuracy                           0.57      9193
   macro avg       0.69      0.64      0.56      9193
weighted avg       0.70      0.57      0.51      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
              precision    recall  f1-score   support

           0       0.37      0.99      0.54      2241
           1       0.93      0.08      0.15      4102
           2       1.00      1.00      1.00       933

    accuracy                           0.48      7276
   macro avg       0.77      0.69      0.56      7276
weighted avg       0.77      0.48      0.38      7276


=== Overall Accuracy ===
0.6129945008100668 [0.6478229052323454, 0.807084729535663, 0.8184966094358679, 0.22085889570552147, 0.7471204188481675, 0.5697813553790928, 0.47979659153380977]
=== Evaluation Metrics ===
        Label  Accuracy  Sensitivity  Specificity      PPV      NPV
            0  0.694605     0.622771     0.722901 0.469576 0.829498
            1  0.675789     0.550765     0.791917 0.710859 0.654916
            2  0.980687     0.993527     0.976724 0.929453 0.997959
Macro Average  0.783694     0.722354     0.830514 0.703296 0.827458
In [25]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'TDP43_Untreated':1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=True,
    norm=False,
    choose_features=False,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
              precision    recall  f1-score   support

           0       0.38      0.89      0.53      2576
           1       0.88      0.35      0.51      5889
           2       1.00      1.00      1.00      2467

    accuracy                           0.63     10932
   macro avg       0.75      0.75      0.68     10932
weighted avg       0.79      0.63      0.62     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
              precision    recall  f1-score   support

           0       0.46      0.89      0.61      1586
           1       0.93      0.63      0.75      4377
           2       1.00      0.99      1.00      2393

    accuracy                           0.78      8356
   macro avg       0.80      0.84      0.79      8356
weighted avg       0.86      0.78      0.80      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
              precision    recall  f1-score   support

           0       0.72      0.64      0.68      2049
           1       0.78      0.83      0.80      3078
           2       1.00      1.00      1.00      1804

    accuracy                           0.82      6931
   macro avg       0.83      0.82      0.83      6931
weighted avg       0.82      0.82      0.82      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
              precision    recall  f1-score   support

           0       0.88      0.09      0.16        81
           1       0.20      1.00      0.34        31
           2       1.00      0.04      0.08        51

    accuracy                           0.25       163
   macro avg       0.69      0.38      0.19       163
weighted avg       0.79      0.25      0.17       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
              precision    recall  f1-score   support

           0       0.52      0.73      0.60      2673
           1       0.73      0.51      0.60      3748
           2       1.00      1.00      1.00      3129

    accuracy                           0.73      9550
   macro avg       0.75      0.75      0.73      9550
weighted avg       0.76      0.73      0.73      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
              precision    recall  f1-score   support

           0       0.93      0.24      0.39      3602
           1       0.51      0.71      0.60      4009
           2       0.59      1.00      0.74      1582

    accuracy                           0.58      9193
   macro avg       0.68      0.65      0.57      9193
weighted avg       0.69      0.58      0.54      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
              precision    recall  f1-score   support

           0       0.37      0.99      0.54      2241
           1       0.95      0.07      0.13      4102
           2       1.00      1.00      1.00       933

    accuracy                           0.47      7276
   macro avg       0.77      0.69      0.56      7276
weighted avg       0.78      0.47      0.37      7276


=== Overall Accuracy ===
0.6082226606460679 [0.6268752286864252, 0.7826711345141216, 0.8195065647092772, 0.24539877300613497, 0.7320418848167539, 0.577178287827695, 0.47388675096206706]
=== Evaluation Metrics ===
        Label  Accuracy  Sensitivity  Specificity      PPV      NPV
            0  0.687468     0.681186     0.689942 0.463919 0.846011
            1  0.665503     0.494729     0.824125 0.723207 0.637155
            2  0.977462     0.994174     0.972304 0.917214 0.998154
Macro Average  0.776811     0.723363     0.828790 0.701447 0.827107
In [26]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'TDP43_Untreated':1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=True,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.19      0.36      0.25      2576
           1       0.54      0.33      0.41      5889
           2       1.00      0.99      1.00      2467

    accuracy                           0.49     10932
   macro avg       0.58      0.56      0.55     10932
weighted avg       0.56      0.49      0.51     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.25      0.20      0.22      1586
           1       0.72      0.78      0.75      4377
           2       1.00      0.98      0.99      2393

    accuracy                           0.73      8356
   macro avg       0.66      0.65      0.65      8356
weighted avg       0.71      0.73      0.72      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.42      0.10      0.16      2049
           1       0.60      0.91      0.72      3078
           2       1.00      0.98      0.99      1804

    accuracy                           0.69      6931
   macro avg       0.67      0.66      0.63      6931
weighted avg       0.65      0.69      0.63      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
Selecting top 100 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        81
           1       0.20      1.00      0.33        31
           2       0.83      0.10      0.18        51

    accuracy                           0.22       163
   macro avg       0.34      0.37      0.17       163
weighted avg       0.30      0.22      0.12       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.25      0.17      0.21      2673
           1       0.52      0.64      0.57      3748
           2       1.00      0.99      1.00      3129

    accuracy                           0.62      9550
   macro avg       0.59      0.60      0.59      9550
weighted avg       0.60      0.62      0.61      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.69      0.00      0.01      3602
           1       0.38      0.54      0.44      4009
           2       0.46      1.00      0.63      1582

    accuracy                           0.41      9193
   macro avg       0.51      0.51      0.36      9193
weighted avg       0.51      0.41      0.31      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
Selecting top 100 features...
              precision    recall  f1-score   support

           0       0.32      0.36      0.34      2241
           1       0.62      0.58      0.60      4102
           2       1.00      0.99      1.00       933

    accuracy                           0.57      7276
   macro avg       0.65      0.64      0.65      7276
weighted avg       0.58      0.57      0.57      7276


=== Overall Accuracy ===
0.5320315149283343 [0.4880168313208928, 0.7277405457156534, 0.6885009378156111, 0.22085889570552147, 0.6231413612565445, 0.409441966713804, 0.5665200659703133]
=== Evaluation Metrics ===
        Label  Accuracy  Sensitivity  Specificity      PPV      NPV
            0  0.612240     0.184630     0.780677 0.249021 0.708512
            1  0.573749     0.600460     0.548938 0.552872 0.596639
            2  0.961356     0.985112     0.954023 0.868650 0.995206
Macro Average  0.715782     0.590067     0.761213 0.556848 0.766786
In [27]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 1,
    'TDP43_Untreated':1,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map = label_map,
    apply_pca=True
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2]
Test: (10932, 5568) Labels: [0 1 2]
2: 9892
0: 12232
1: 19345
              precision    recall  f1-score   support

           0       0.27      0.61      0.38      2576
           1       0.63      0.29      0.40      5889
           2       1.00      1.00      1.00      2467

    accuracy                           0.52     10932
   macro avg       0.63      0.63      0.59     10932
weighted avg       0.63      0.52      0.53     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2]
Test: (8356, 5568) Labels: [0 1 2]
2: 9966
0: 13222
1: 20857
              precision    recall  f1-score   support

           0       0.41      0.62      0.50      1586
           1       0.82      0.69      0.75      4377
           2       1.00      0.98      0.99      2393

    accuracy                           0.76      8356
   macro avg       0.75      0.76      0.74      8356
weighted avg       0.80      0.76      0.77      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2]
Test: (6931, 5568) Labels: [0 1 2]
2: 10555
0: 12759
1: 22156
              precision    recall  f1-score   support

           0       0.76      0.13      0.22      2049
           1       0.63      0.97      0.76      3078
           2       1.00      1.00      1.00      1804

    accuracy                           0.73      6931
   macro avg       0.80      0.70      0.66      6931
weighted avg       0.76      0.73      0.66      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2]
Test: (163, 5568) Labels: [0 1 2]
2: 12308
0: 14727
1: 25203
              precision    recall  f1-score   support

           0       1.00      0.01      0.02        81
           1       0.19      1.00      0.32        31
           2       1.00      0.02      0.04        51

    accuracy                           0.20       163
   macro avg       0.73      0.34      0.13       163
weighted avg       0.85      0.20      0.09       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2]
Test: (9550, 5568) Labels: [0 1 2]
2: 9230
0: 12135
1: 21486
              precision    recall  f1-score   support

           0       0.57      0.36      0.44      2673
           1       0.64      0.81      0.71      3748
           2       1.00      1.00      1.00      3129

    accuracy                           0.74      9550
   macro avg       0.74      0.72      0.72      9550
weighted avg       0.74      0.74      0.73      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2]
Test: (9193, 5568) Labels: [0 1 2]
2: 10777
0: 11206
1: 21225
              precision    recall  f1-score   support

           0       0.79      0.04      0.08      3602
           1       0.48      0.81      0.61      4009
           2       0.68      1.00      0.81      1582

    accuracy                           0.54      9193
   macro avg       0.65      0.62      0.50      9193
weighted avg       0.64      0.54      0.44      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2]
Test: (7276, 5568) Labels: [0 1 2]
2: 11426
0: 12567
1: 21132
              precision    recall  f1-score   support

           0       0.36      0.98      0.53      2241
           1       0.83      0.06      0.12      4102
           2       1.00      1.00      1.00       933

    accuracy                           0.46      7276
   macro avg       0.73      0.68      0.55      7276
weighted avg       0.71      0.46      0.36      7276


=== Overall Accuracy ===
0.5661394223845085 [0.5241492864983535, 0.7571804691239827, 0.7296205453758476, 0.20245398773006135, 0.7442931937172775, 0.540737517676493, 0.4645409565695437]
=== Evaluation Metrics ===
        Label  Accuracy  Sensitivity  Specificity      PPV      NPV
            0  0.638213     0.412480     0.727130 0.373213 0.758568
            1  0.622717     0.564794     0.676519 0.618576 0.625966
            2  0.983550     0.990857     0.981295 0.942362 0.997132
Macro Average  0.748160     0.656043     0.794981 0.644717 0.793889
In [ ]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 0,
    'TBK1_Untreated': 1,
    'OPTN_Untreated': 3,
    'TDP43_Untreated':4,
    'FUSHeterozygous_Untreated': 2,
    'FUSHomozygous_Untreated': 2,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=True,
    norm=False,
    choose_features=False,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    label_map = label_map
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4]
Test: (10932, 5568) Labels: [0 1 2 3 4]
2: 9892
0: 12232
3: 7535
1: 4205
4: 7605
              precision    recall  f1-score   support

           0       0.38      0.75      0.50      2576
           1       0.14      0.00      0.01      1876
           2       1.00      1.00      1.00      2467
           3       0.79      0.46      0.58      2314
           4       0.35      0.40      0.37      1699

    accuracy                           0.56     10932
   macro avg       0.53      0.52      0.49     10932
weighted avg       0.56      0.56      0.52     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4]
Test: (8356, 5568) Labels: [0 1 2 3 4]
2: 9966
0: 13222
3: 8200
1: 4861
4: 7796
              precision    recall  f1-score   support

           0       0.48      0.86      0.61      1586
           1       0.69      0.68      0.68      1220
           2       1.00      0.99      0.99      2393
           3       0.62      0.56      0.59      1649
           4       0.34      0.10      0.15      1508

    accuracy                           0.67      8356
   macro avg       0.62      0.64      0.61      8356
weighted avg       0.66      0.67      0.64      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4]
Test: (6931, 5568) Labels: [0 1 2 3 4]
2: 10555
0: 12759
3: 8746
1: 5036
4: 8374
              precision    recall  f1-score   support

           0       0.75      0.59      0.66      2049
           1       0.44      0.89      0.59      1045
           2       1.00      1.00      1.00      1804
           3       0.50      0.33      0.39      1103
           4       0.47      0.33      0.39       930

    accuracy                           0.67      6931
   macro avg       0.63      0.63      0.61      6931
weighted avg       0.69      0.67      0.66      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4]
Test: (163, 5568) Labels: [0 1 2 3 4]
2: 12308
0: 14727
3: 9844
1: 6068
4: 9291
              precision    recall  f1-score   support

           0       1.00      0.12      0.22        81
           1       0.50      0.08      0.13        13
           2       1.00      0.02      0.04        51
           3       0.05      0.80      0.10         5
           4       0.05      0.31      0.09        13

    accuracy                           0.12       163
   macro avg       0.52      0.27      0.12       163
weighted avg       0.86      0.12      0.14       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4]
Test: (9550, 5568) Labels: [0 1 2 3 4]
2: 9230
0: 12135
3: 8420
1: 5326
4: 7740
              precision    recall  f1-score   support

           0       0.47      0.43      0.45      2673
           1       0.71      0.13      0.22       755
           2       1.00      1.00      1.00      3129
           3       0.39      0.19      0.25      1429
           4       0.28      0.56      0.37      1564

    accuracy                           0.58      9550
   macro avg       0.57      0.46      0.46      9550
weighted avg       0.62      0.58      0.57      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4]
Test: (9193, 5568) Labels: [0 1 2 3 4]
2: 10777
0: 11206
3: 8263
1: 5097
4: 7865
In [ ]:
# Label mapping
label_map = {
    'WT_Untreated': 0,
    'FUSRevertant_Untreated': 1,
    'TBK1_Untreated': 2,
    'OPTN_Untreated': 3,
    'TDP43_Untreated':4,
    'FUSHeterozygous_Untreated': 5,
    'FUSHomozygous_Untreated': 6,
}

run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={'class_weight':{0: 10, 1: 1.0, 2: 1.0, 3: 1, 4: 1.0, 5: 1.0, 6:1}},
    label_map = label_map
)
In [ ]:
 
In [2]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=False,
    top_k=100,
    apply_pca=False,
    pca_components=50,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
    return_proba=False
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.66      0.91      0.76      1222
           1       0.87      0.53      0.66      1245
           2       0.81      0.91      0.86      1015
           3       0.79      0.50      0.61      2314
           4       0.13      0.00      0.00      1876
           5       0.28      0.30      0.29      1699
           6       0.28      0.71      0.40      1561

    accuracy                           0.50     10932
   macro avg       0.54      0.55      0.51     10932
weighted avg       0.52      0.50      0.47     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.63      0.64      0.63      1231
           1       0.62      0.59      0.60      1162
           2       0.48      0.99      0.64       800
           3       0.57      0.65      0.61      1649
           4       0.64      0.61      0.62      1220
           5       0.39      0.14      0.21      1508
           6       0.61      0.55      0.58       786

    accuracy                           0.57      8356
   macro avg       0.56      0.60      0.56      8356
weighted avg       0.56      0.57      0.54      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.73      0.97      0.84      1004
           1       0.94      0.57      0.71       800
           2       0.70      0.73      0.72      1131
           3       0.47      0.40      0.43      1103
           4       0.42      0.82      0.56      1045
           5       0.49      0.38      0.43       930
           6       0.57      0.17      0.26       918

    accuracy                           0.59      6931
   macro avg       0.62      0.58      0.56      6931
weighted avg       0.61      0.59      0.56      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.06      0.80      0.12         5
           4       0.50      0.08      0.13        13
           5       0.07      0.23      0.11        13
           6       0.62      0.97      0.76        37

    accuracy                           0.27       163
   macro avg       0.18      0.30      0.16       163
weighted avg       0.19      0.27      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.81      0.19      0.31      1567
           1       0.54      0.95      0.69      1562
           2       0.65      0.45      0.53      1163
           3       0.43      0.24      0.31      1429
           4       0.73      0.14      0.23       755
           5       0.29      0.64      0.40      1564
           6       0.43      0.36      0.39      1510

    accuracy                           0.45      9550
   macro avg       0.55      0.42      0.41      9550
weighted avg       0.54      0.45      0.42      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.19      0.27      0.22       963
           1       0.28      0.54      0.37       619
           2       0.79      0.11      0.19      1298
           3       0.18      0.44      0.26      1586
           4       0.34      0.24      0.28       984
           5       0.10      0.09      0.10      1439
           6       0.94      0.26      0.41      2304

    accuracy                           0.26      9193
   macro avg       0.40      0.28      0.26      9193
weighted avg       0.47      0.26      0.27      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.59      0.21      0.31       267
           1       0.75      0.94      0.84       666
           2       0.04      0.87      0.08        45
           3       0.96      0.03      0.05      1763
           4       0.28      0.45      0.35       188
           5       0.09      0.01      0.01      2151
           6       0.43      0.98      0.60      2196

    accuracy                           0.42      7276
   macro avg       0.45      0.50      0.32      7276
weighted avg       0.49      0.42      0.30      7276


=== Overall Accuracy ===
0.43597176371995305 [0.5032930845225028, 0.5667783628530397, 0.5854854999278604, 0.26993865030674846, 0.45057591623036647, 0.2599804198846949, 0.4157504123144585]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.896433     0.557006     0.942651 0.569429 0.939859
  FUSHomozygous_Untreated  0.910059     0.695016     0.938280 0.596414 0.959088
   FUSRevertant_Untreated  0.907292     0.591157     0.944334 0.554437 0.951720
           OPTN_Untreated  0.784909     0.382577     0.878032 0.420630 0.860023
           TBK1_Untreated  0.878094     0.333827     0.949547 0.464850 0.915664
          TDP43_Untreated  0.753383     0.240542     0.864097 0.276467 0.840518
             WT_Untreated  0.788057     0.542311     0.841166 0.424584 0.894784
            Macro Average  0.845461     0.477491     0.908301 0.472401 0.908808
In [3]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=True,
    norm=False,
    choose_features=False,
    top_k=100,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
              precision    recall  f1-score   support

           0       0.65      0.92      0.76      1222
           1       0.87      0.52      0.65      1245
           2       0.77      0.93      0.84      1015
           3       0.79      0.50      0.61      2314
           4       0.21      0.00      0.01      1876
           5       0.28      0.31      0.30      1699
           6       0.28      0.71      0.40      1561

    accuracy                           0.50     10932
   macro avg       0.55      0.56      0.51     10932
weighted avg       0.53      0.50      0.47     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
              precision    recall  f1-score   support

           0       0.62      0.64      0.63      1231
           1       0.62      0.57      0.59      1162
           2       0.45      0.99      0.62       800
           3       0.59      0.62      0.60      1649
           4       0.61      0.64      0.62      1220
           5       0.37      0.13      0.19      1508
           6       0.62      0.55      0.58       786

    accuracy                           0.56      8356
   macro avg       0.55      0.59      0.55      8356
weighted avg       0.55      0.56      0.53      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
              precision    recall  f1-score   support

           0       0.73      0.97      0.83      1004
           1       0.93      0.57      0.71       800
           2       0.69      0.75      0.72      1131
           3       0.47      0.34      0.40      1103
           4       0.40      0.83      0.53      1045
           5       0.49      0.34      0.40       930
           6       0.56      0.16      0.25       918

    accuracy                           0.57      6931
   macro avg       0.61      0.56      0.55      6931
weighted avg       0.60      0.57      0.55      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       1.00      0.04      0.07        26
           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        44
           3       0.07      0.80      0.12         5
           4       0.50      0.08      0.13        13
           5       0.07      0.23      0.11        13
           6       0.60      0.97      0.74        37

    accuracy                           0.28       163
   macro avg       0.32      0.30      0.17       163
weighted avg       0.34      0.28      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
              precision    recall  f1-score   support

           0       0.81      0.21      0.34      1567
           1       0.55      0.94      0.69      1562
           2       0.61      0.50      0.55      1163
           3       0.43      0.22      0.29      1429
           4       0.69      0.16      0.26       755
           5       0.30      0.65      0.41      1564
           6       0.43      0.35      0.39      1510

    accuracy                           0.46      9550
   macro avg       0.55      0.43      0.42      9550
weighted avg       0.53      0.46      0.43      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
              precision    recall  f1-score   support

           0       0.20      0.32      0.24       963
           1       0.27      0.51      0.36       619
           2       0.80      0.14      0.24      1298
           3       0.18      0.40      0.24      1586
           4       0.35      0.29      0.31       984
           5       0.10      0.09      0.09      1439
           6       0.94      0.27      0.42      2304

    accuracy                           0.27      9193
   macro avg       0.41      0.29      0.27      9193
weighted avg       0.47      0.27      0.28      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
              precision    recall  f1-score   support

           0       0.59      0.19      0.29       267
           1       0.75      0.95      0.84       666
           2       0.04      0.89      0.08        45
           3       0.96      0.03      0.05      1763
           4       0.28      0.46      0.35       188
           5       0.10      0.01      0.01      2151
           6       0.43      0.98      0.60      2196

    accuracy                           0.41      7276
   macro avg       0.45      0.50      0.32      7276
weighted avg       0.49      0.41      0.29      7276


=== Overall Accuracy ===
0.43583060784180494 [0.5038419319429198, 0.5594782192436573, 0.5730774779974029, 0.27607361963190186, 0.4568586387434555, 0.2668334602414881, 0.4146509070918087]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.895670     0.569268     0.940114 0.564147 0.941277
  FUSHomozygous_Untreated  0.910078     0.688436     0.939165 0.597601 0.958280
   FUSRevertant_Untreated  0.903647     0.618268     0.937086 0.535202 0.954443
           OPTN_Untreated  0.787828     0.359326     0.887009 0.423985 0.856767
           TBK1_Untreated  0.873991     0.351751     0.942552 0.445625 0.917187
          TDP43_Untreated  0.756512     0.235490     0.868993 0.279571 0.840387
             WT_Untreated  0.789546     0.537586     0.843997 0.426842 0.894131
            Macro Average  0.845325     0.480018     0.908417 0.467568 0.908925
In [6]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=True,
    norm=False,
    choose_features=True,
    top_k=10,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.44      0.59      0.50      1222
           1       0.38      0.23      0.29      1245
           2       0.16      0.12      0.14      1015
           3       0.20      0.07      0.11      2314
           4       0.22      0.09      0.13      1876
           5       0.26      0.05      0.08      1699
           6       0.11      0.41      0.17      1561

    accuracy                           0.20     10932
   macro avg       0.25      0.22      0.20     10932
weighted avg       0.24      0.20      0.18     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.51      0.17      0.26      1231
           1       0.48      0.72      0.58      1162
           2       0.14      0.32      0.19       800
           3       0.48      0.20      0.29      1649
           4       0.17      0.15      0.16      1220
           5       0.29      0.27      0.28      1508
           6       0.16      0.25      0.20       786

    accuracy                           0.29      8356
   macro avg       0.32      0.30      0.28      8356
weighted avg       0.34      0.29      0.28      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.55      0.27      0.36      1004
           1       0.46      0.70      0.56       800
           2       0.17      0.15      0.16      1131
           3       0.16      0.06      0.09      1103
           4       0.20      0.21      0.20      1045
           5       0.18      0.23      0.20       930
           6       0.18      0.28      0.22       918

    accuracy                           0.25      6931
   macro avg       0.27      0.27      0.26      6931
weighted avg       0.26      0.25      0.24      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 10 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.25      0.12      0.16        26
           1       0.14      0.76      0.24        25
           2       0.00      0.00      0.00        44
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        37

    accuracy                           0.13       163
   macro avg       0.06      0.13      0.06       163
weighted avg       0.06      0.13      0.06       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.49      0.51      0.50      1567
           1       0.48      0.44      0.46      1562
           2       0.16      0.25      0.20      1163
           3       0.29      0.15      0.20      1429
           4       0.14      0.06      0.08       755
           5       0.21      0.27      0.24      1564
           6       0.24      0.26      0.25      1510

    accuracy                           0.30      9550
   macro avg       0.29      0.28      0.27      9550
weighted avg       0.30      0.30      0.29      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.28      0.96      0.43       963
           1       0.26      0.14      0.18       619
           2       0.41      0.13      0.19      1298
           3       0.00      0.00      0.00      1586
           4       0.00      0.00      0.00       984
           5       0.00      0.00      0.00      1439
           6       0.12      0.02      0.04      2304

    accuracy                           0.13      9193
   macro avg       0.15      0.18      0.12      9193
weighted avg       0.13      0.13      0.09      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 10 features...
              precision    recall  f1-score   support

           0       0.13      0.12      0.12       267
           1       0.67      0.67      0.67       666
           2       0.01      0.40      0.02        45
           3       0.19      0.05      0.08      1763
           4       0.03      0.10      0.05       188
           5       0.34      0.30      0.32      2151
           6       0.41      0.36      0.39      2196

    accuracy                           0.28      7276
   macro avg       0.25      0.29      0.24      7276
weighted avg       0.34      0.28      0.30      7276


=== Overall Accuracy ===
0.22780189082915434 [0.20087815587266739, 0.2892532312111058, 0.2547972875486943, 0.13496932515337423, 0.2978010471204188, 0.13434134667681932, 0.28257284222100054]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.844564     0.470860     0.895449 0.380126 0.925530
  FUSHomozygous_Untreated  0.875747     0.481987     0.927421 0.465671 0.931705
   FUSRevertant_Untreated  0.794660     0.186317     0.865942 0.140044 0.900818
           OPTN_Untreated  0.702601     0.090161     0.844355 0.118227 0.800379
           TBK1_Untreated  0.829011     0.103766     0.924223 0.152379 0.887070
          TDP43_Untreated  0.757695     0.191208     0.879992 0.255934 0.834433
             WT_Untreated  0.674186     0.249893     0.765880 0.187435 0.825314
            Macro Average  0.782638     0.253456     0.871894 0.242831 0.872179
In [7]:
## Baseline
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=True,
    top_k=1000,
    label_map=None,
    apply_pca = True,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.57      0.89      0.70      1222
           1       0.77      0.35      0.48      1245
           2       0.55      0.67      0.61      1015
           3       0.52      0.49      0.50      2314
           4       0.06      0.01      0.01      1876
           5       0.28      0.17      0.21      1699
           6       0.21      0.52      0.30      1561

    accuracy                           0.41     10932
   macro avg       0.42      0.44      0.40     10932
weighted avg       0.40      0.41      0.37     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.53      0.48      0.50      1231
           1       0.49      0.52      0.51      1162
           2       0.33      0.72      0.45       800
           3       0.38      0.49      0.43      1649
           4       0.40      0.41      0.41      1220
           5       0.33      0.06      0.10      1508
           6       0.40      0.31      0.35       786

    accuracy                           0.41      8356
   macro avg       0.41      0.43      0.39      8356
weighted avg       0.41      0.41      0.38      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.67      0.84      0.75      1004
           1       0.73      0.49      0.58       800
           2       0.46      0.50      0.48      1131
           3       0.29      0.34      0.32      1103
           4       0.29      0.48      0.36      1045
           5       0.45      0.36      0.40       930
           6       0.24      0.03      0.06       918

    accuracy                           0.44      6931
   macro avg       0.45      0.43      0.42      6931
weighted avg       0.44      0.44      0.42      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 1000 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       1.00      0.08      0.14        26
           1       1.00      0.12      0.21        25
           2       0.00      0.00      0.00        44
           3       0.04      1.00      0.08         5
           4       0.00      0.00      0.00        13
           5       0.05      0.08      0.06        13
           6       0.89      0.46      0.61        37

    accuracy                           0.17       163
   macro avg       0.43      0.25      0.16       163
weighted avg       0.52      0.17      0.20       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.67      0.04      0.07      1567
           1       0.50      0.98      0.66      1562
           2       0.38      0.15      0.22      1163
           3       0.40      0.23      0.29      1429
           4       0.34      0.26      0.29       755
           5       0.25      0.48      0.33      1564
           6       0.32      0.33      0.32      1510

    accuracy                           0.37      9550
   macro avg       0.41      0.35      0.31      9550
weighted avg       0.42      0.37      0.32      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.18      0.33      0.23       963
           1       0.31      0.69      0.43       619
           2       0.22      0.03      0.05      1298
           3       0.02      0.06      0.03      1586
           4       0.16      0.03      0.05       984
           5       0.07      0.06      0.07      1439
           6       0.73      0.12      0.21      2304

    accuracy                           0.14      9193
   macro avg       0.24      0.19      0.15      9193
weighted avg       0.29      0.14      0.13      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 1000 features...
              precision    recall  f1-score   support

           0       0.54      0.76      0.63       267
           1       0.89      0.74      0.81       666
           2       0.04      0.93      0.08        45
           3       0.97      0.03      0.06      1763
           4       0.09      0.05      0.07       188
           5       0.06      0.00      0.01      2151
           6       0.41      0.97      0.58      2196

    accuracy                           0.40      7276
   macro avg       0.43      0.50      0.32      7276
weighted avg       0.48      0.40      0.29      7276


=== Overall Accuracy ===
0.3338935016889238 [0.4060556165386023, 0.4088080421254189, 0.43904198528350885, 0.17177914110429449, 0.37036649214659684, 0.13782225606439683, 0.40338097855964816]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.873304     0.493790     0.924980 0.472641 0.930650
  FUSHomozygous_Untreated  0.892502     0.638592     0.925824 0.530473 0.951268
   FUSRevertant_Untreated  0.863991     0.377365     0.921011 0.358886 0.926601
           OPTN_Untreated  0.714032     0.284699     0.813405 0.260983 0.830881
           TBK1_Untreated  0.854697     0.205723     0.939896 0.310037 0.900136
          TDP43_Untreated  0.761875     0.166595     0.890387 0.247051 0.831899
             WT_Untreated  0.751989     0.429875     0.821602 0.342429 0.869593
            Macro Average  0.816056     0.370948     0.891015 0.360357 0.891575
In [8]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=True,
    top_k=5,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.44      0.55      0.49      1222
           1       0.42      0.29      0.34      1245
           2       0.00      0.00      0.00      1015
           3       0.26      0.15      0.19      2314
           4       0.07      0.00      0.00      1876
           5       0.21      0.33      0.25      1699
           6       0.14      0.40      0.21      1561

    accuracy                           0.24     10932
   macro avg       0.22      0.25      0.21     10932
weighted avg       0.22      0.24      0.20     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 5 features...
              precision    recall  f1-score   support

           0       0.41      0.06      0.10      1231
           1       0.47      0.74      0.58      1162
           2       0.00      0.00      0.00       800
           3       0.29      0.21      0.25      1649
           4       0.18      0.01      0.02      1220
           5       0.26      0.36      0.30      1508
           6       0.13      0.49      0.21       786

    accuracy                           0.27      8356
   macro avg       0.25      0.27      0.21      8356
weighted avg       0.27      0.27      0.22      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.49      0.15      0.23      1004
           1       0.45      0.76      0.56       800
           2       0.00      0.00      0.00      1131
           3       0.17      0.15      0.16      1103
           4       0.07      0.00      0.00      1045
           5       0.18      0.53      0.27       930
           6       0.21      0.36      0.26       918

    accuracy                           0.25      6931
   macro avg       0.22      0.28      0.21      6931
weighted avg       0.21      0.25      0.19      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.13      0.72      0.23        25
           2       0.00      0.00      0.00        44
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        37

    accuracy                           0.11       163
   macro avg       0.02      0.10      0.03       163
weighted avg       0.02      0.11      0.03       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 5 features...
              precision    recall  f1-score   support

           0       0.47      0.59      0.52      1567
           1       0.41      0.26      0.32      1562
           2       0.00      0.00      0.00      1163
           3       0.24      0.36      0.29      1429
           4       0.03      0.00      0.00       755
           5       0.22      0.30      0.25      1564
           6       0.23      0.33      0.27      1510

    accuracy                           0.30      9550
   macro avg       0.23      0.26      0.24      9550
weighted avg       0.25      0.30      0.27      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.35      0.97      0.51       963
           1       0.16      0.13      0.15       619
           2       0.19      0.01      0.01      1298
           3       0.01      0.04      0.02      1586
           4       0.00      0.00      0.00       984
           5       0.00      0.00      0.00      1439
           6       0.15      0.01      0.01      2304

    accuracy                           0.12      9193
   macro avg       0.12      0.17      0.10      9193
weighted avg       0.11      0.12      0.07      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 5 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.23      0.46      0.31       267
           1       0.66      0.35      0.46       666
           2       0.00      0.00      0.00        45
           3       0.26      0.30      0.28      1763
           4       0.00      0.00      0.00       188
           5       0.36      0.39      0.38      2151
           6       0.41      0.38      0.40      2196

    accuracy                           0.35      7276
   macro avg       0.28      0.27      0.26      7276
weighted avg       0.36      0.35      0.35      7276


=== Overall Accuracy ===
0.2329581302927753 [0.23609586534943286, 0.26627573001436095, 0.25046890780551145, 0.11042944785276074, 0.2959162303664922, 0.1200913738714239, 0.35142935678944476]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.852274     0.458599     0.905878 0.398837 0.924745
  FUSHomozygous_Untreated  0.866758     0.424247     0.924831 0.425507 0.924471
   FUSRevertant_Untreated  0.894544     0.001456     0.999190 0.173913 0.895177
           OPTN_Untreated  0.636057     0.200426     0.736887 0.149886 0.799266
           TBK1_Untreated  0.882006     0.002138     0.997517 0.101562 0.883917
          TDP43_Untreated  0.696342     0.311801     0.779358 0.233763 0.839889
             WT_Untreated  0.669987     0.289089     0.752303 0.201422 0.830413
            Macro Average  0.785424     0.241108     0.870852 0.240699 0.871126
In [9]:
run_baseline_model(
    batches=[1, 2, 3, 7, 8, 9, 10],
    balance=False,
    norm=False,
    choose_features=True,
    top_k=20,
    label_map=None,
    classifier_class=cuMLLogisticRegression,
    classifier_kwargs={},
)
=== Batch 1 ===
Train: (41469, 5568) Labels: [0 1 2 3 4 5 6]
Test: (10932, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5058
FUSHomozygous_Untreated: 4834
FUSRevertant_Untreated: 4481
OPTN_Untreated: 7535
TBK1_Untreated: 4205
TDP43_Untreated: 7605
WT_Untreated: 7751
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.47      0.73      0.57      1222
           1       0.41      0.16      0.23      1245
           2       0.19      0.03      0.05      1015
           3       0.17      0.20      0.18      2314
           4       0.08      0.00      0.00      1876
           5       0.26      0.10      0.14      1699
           6       0.10      0.31      0.15      1561

    accuracy                           0.20     10932
   macro avg       0.24      0.22      0.19     10932
weighted avg       0.22      0.20      0.18     10932


=== Batch 2 ===
Train: (44045, 5568) Labels: [0 1 2 3 4 5 6]
Test: (8356, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5049
FUSHomozygous_Untreated: 4917
FUSRevertant_Untreated: 4696
OPTN_Untreated: 8200
TBK1_Untreated: 4861
TDP43_Untreated: 7796
WT_Untreated: 8526
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.45      0.18      0.25      1231
           1       0.46      0.67      0.54      1162
           2       0.23      0.05      0.09       800
           3       0.40      0.33      0.37      1649
           4       0.16      0.02      0.03      1220
           5       0.30      0.36      0.33      1508
           6       0.15      0.51      0.23       786

    accuracy                           0.31      8356
   macro avg       0.31      0.30      0.26      8356
weighted avg       0.32      0.31      0.28      8356


=== Batch 3 ===
Train: (45470, 5568) Labels: [0 1 2 3 4 5 6]
Test: (6931, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5276
FUSHomozygous_Untreated: 5279
FUSRevertant_Untreated: 4365
OPTN_Untreated: 8746
TBK1_Untreated: 5036
TDP43_Untreated: 8374
WT_Untreated: 8394
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.58      0.36      0.45      1004
           1       0.48      0.66      0.55       800
           2       0.32      0.01      0.02      1131
           3       0.18      0.11      0.14      1103
           4       0.10      0.01      0.01      1045
           5       0.18      0.39      0.24       930
           6       0.14      0.36      0.20       918

    accuracy                           0.25      6931
   macro avg       0.28      0.27      0.23      6931
weighted avg       0.28      0.25      0.21      6931


=== Batch 7 ===
Train: (52238, 5568) Labels: [0 1 2 3 4 5 6]
Test: (163, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6254
FUSHomozygous_Untreated: 6054
FUSRevertant_Untreated: 5452
OPTN_Untreated: 9844
TBK1_Untreated: 6068
TDP43_Untreated: 9291
WT_Untreated: 9275
Selecting top 20 features...
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.50      0.15      0.24        26
           1       0.13      0.72      0.23        25
           2       0.00      0.00      0.00        44
           3       0.05      0.20      0.08         5
           4       0.00      0.00      0.00        13
           5       0.00      0.00      0.00        13
           6       0.00      0.00      0.00        37

    accuracy                           0.14       163
   macro avg       0.10      0.15      0.08       163
weighted avg       0.10      0.14      0.07       163


=== Batch 8 ===
Train: (42851, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9550, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 4713
FUSHomozygous_Untreated: 4517
FUSRevertant_Untreated: 4333
OPTN_Untreated: 8420
TBK1_Untreated: 5326
TDP43_Untreated: 7740
WT_Untreated: 7802
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.49      0.51      0.50      1567
           1       0.49      0.45      0.47      1562
           2       0.11      0.02      0.03      1163
           3       0.29      0.27      0.28      1429
           4       0.03      0.00      0.00       755
           5       0.21      0.36      0.27      1564
           6       0.20      0.30      0.24      1510

    accuracy                           0.31      9550
   macro avg       0.26      0.27      0.26      9550
weighted avg       0.29      0.31      0.29      9550


=== Batch 9 ===
Train: (43208, 5568) Labels: [0 1 2 3 4 5 6]
Test: (9193, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 5317
FUSHomozygous_Untreated: 5460
FUSRevertant_Untreated: 4198
OPTN_Untreated: 8263
TBK1_Untreated: 5097
TDP43_Untreated: 7865
WT_Untreated: 7008
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.35      0.96      0.51       963
           1       0.14      0.17      0.16       619
           2       0.28      0.01      0.02      1298
           3       0.00      0.00      0.00      1586
           4       0.00      0.00      0.00       984
           5       0.00      0.00      0.00      1439
           6       0.13      0.01      0.01      2304

    accuracy                           0.12      9193
   macro avg       0.13      0.16      0.10      9193
weighted avg       0.12      0.12      0.07      9193


=== Batch 10 ===
Train: (45125, 5568) Labels: [0 1 2 3 4 5 6]
Test: (7276, 5568) Labels: [0 1 2 3 4 5 6]
FUSHeterozygous_Untreated: 6013
FUSHomozygous_Untreated: 5413
FUSRevertant_Untreated: 5451
OPTN_Untreated: 8086
TBK1_Untreated: 5893
TDP43_Untreated: 7153
WT_Untreated: 7116
Selecting top 20 features...
              precision    recall  f1-score   support

           0       0.17      0.24      0.20       267
           1       0.64      0.52      0.58       666
           2       0.02      0.31      0.04        45
           3       0.15      0.03      0.05      1763
           4       0.03      0.03      0.03       188
           5       0.33      0.35      0.34      2151
           6       0.36      0.46      0.40      2196

    accuracy                           0.31      7276
   macro avg       0.24      0.28      0.23      7276
weighted avg       0.31      0.31      0.29      7276


=== Overall Accuracy ===
0.23270411625965645 [0.2039882912550311, 0.306247008137865, 0.2481604386091473, 0.1411042944785276, 0.30607329842931935, 0.11563145871859024, 0.3077240241891149]
=== Evaluation Metrics ===
                    Label  Accuracy  Sensitivity  Specificity      PPV      NPV
FUSHeterozygous_Untreated  0.858934     0.518471     0.905293 0.427072 0.932465
  FUSHomozygous_Untreated  0.868399     0.441026     0.924485 0.433889 0.926485
   FUSRevertant_Untreated  0.875231     0.023472     0.975035 0.099231 0.894973
           OPTN_Untreated  0.649606     0.160524     0.762808 0.135429 0.796990
           TBK1_Untreated  0.876090     0.006085     0.990307 0.076132 0.883579
          TDP43_Untreated  0.726169     0.255804     0.827714 0.242733 0.837450
             WT_Untreated  0.632469     0.289197     0.706654 0.175634 0.821436
            Macro Average  0.783843     0.242083     0.870328 0.227160 0.870483
In [ ]: